#!pip install folium
#!pip install imbalanced-learn
#Making necessary imports
import numpy as np # linear algebra
import pandas as pd # data processing
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys
import folium
import sklearn
from sklearn import svm
from sklearn.svm import SVC
from sklearn import metrics
from sklearn import tree
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.neighbors import NeighborhoodComponentsAnalysis,KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score , f1_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
import warnings
from matplotlib.colors import LinearSegmentedColormap
from sklearn.metrics import precision_score, recall_score, ConfusionMatrixDisplay
from itertools import cycle
from sklearn.tree import plot_tree
from scipy.stats import randint
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_recall_curve
import warnings
from sklearn.exceptions import ConvergenceWarning
# Loading in the Dataset
Churn =pd.read_csv("Churn.csv")
#Checking head of dataset (first 10)
Churn.head(n=10)
| Customer ID | Gender | Age | Married | Number of Dependents | City | Zip Code | Latitude | Longitude | Number of Referrals | ... | Payment Method | Monthly Charge | Total Charges | Total Refunds | Total Extra Data Charges | Total Long Distance Charges | Total Revenue | Customer Status | Churn Category | Churn Reason | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0002-ORFBO | Female | 37 | Yes | 0 | Frazier Park | 93225 | 34.827662 | -118.999073 | 2 | ... | Credit Card | 65.60 | 593.30 | 0.00 | 0 | 381.51 | 974.81 | Stayed | NaN | NaN |
| 1 | 0003-MKNFE | Male | 46 | No | 0 | Glendale | 91206 | 34.162515 | -118.203869 | 0 | ... | Credit Card | -4.00 | 542.40 | 38.33 | 10 | 96.21 | 610.28 | Stayed | NaN | NaN |
| 2 | 0004-TLHLJ | Male | 50 | No | 0 | Costa Mesa | 92627 | 33.645672 | -117.922613 | 0 | ... | Bank Withdrawal | 73.90 | 280.85 | 0.00 | 0 | 134.60 | 415.45 | Churned | Competitor | Competitor had better devices |
| 3 | 0011-IGKFF | Male | 78 | Yes | 0 | Martinez | 94553 | 38.014457 | -122.115432 | 1 | ... | Bank Withdrawal | 98.00 | 1237.85 | 0.00 | 0 | 361.66 | 1599.51 | Churned | Dissatisfaction | Product dissatisfaction |
| 4 | 0013-EXCHZ | Female | 75 | Yes | 0 | Camarillo | 93010 | 34.227846 | -119.079903 | 3 | ... | Credit Card | 83.90 | 267.40 | 0.00 | 0 | 22.14 | 289.54 | Churned | Dissatisfaction | Network reliability |
| 5 | 0013-MHZWF | Female | 23 | No | 3 | Midpines | 95345 | 37.581496 | -119.972762 | 0 | ... | Credit Card | 69.40 | 571.45 | 0.00 | 0 | 150.93 | 722.38 | Stayed | NaN | NaN |
| 6 | 0013-SMEOE | Female | 67 | Yes | 0 | Lompoc | 93437 | 34.757477 | -120.550507 | 1 | ... | Bank Withdrawal | 109.70 | 7904.25 | 0.00 | 0 | 707.16 | 8611.41 | Stayed | NaN | NaN |
| 7 | 0014-BMAQU | Male | 52 | Yes | 0 | Napa | 94558 | 38.489789 | -122.270110 | 8 | ... | Credit Card | 84.65 | 5377.80 | 0.00 | 20 | 816.48 | 6214.28 | Stayed | NaN | NaN |
| 8 | 0015-UOCOJ | Female | 68 | No | 0 | Simi Valley | 93063 | 34.296813 | -118.685703 | 0 | ... | Bank Withdrawal | 48.20 | 340.35 | 0.00 | 0 | 73.71 | 414.06 | Stayed | NaN | NaN |
| 9 | 0016-QLJIS | Female | 43 | Yes | 1 | Sheridan | 95681 | 38.984756 | -121.345074 | 3 | ... | Credit Card | 90.45 | 5957.90 | 0.00 | 0 | 1849.90 | 7807.80 | Stayed | NaN | NaN |
10 rows × 38 columns
# Checking size of dataset
Churn.shape
(7043, 38)
# Checking summary statistics of numerical features
Churn.describe()
| Age | Number of Dependents | Zip Code | Latitude | Longitude | Number of Referrals | Tenure in Months | Avg Monthly Long Distance Charges | Avg Monthly GB Download | Monthly Charge | Total Charges | Total Refunds | Total Extra Data Charges | Total Long Distance Charges | Total Revenue | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 6361.000000 | 5517.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 |
| mean | 46.509726 | 0.468692 | 93486.070567 | 36.197455 | -119.756684 | 1.951867 | 32.386767 | 25.420517 | 26.189958 | 63.596131 | 2280.381264 | 1.962182 | 6.860713 | 749.099262 | 3034.379056 |
| std | 16.750352 | 0.962802 | 1856.767505 | 2.468929 | 2.154425 | 3.001199 | 24.542061 | 14.200374 | 19.586585 | 31.204743 | 2266.220462 | 7.902614 | 25.104978 | 846.660055 | 2865.204542 |
| min | 19.000000 | 0.000000 | 90001.000000 | 32.555828 | -124.301372 | 0.000000 | 1.000000 | 1.010000 | 2.000000 | -10.000000 | 18.800000 | 0.000000 | 0.000000 | 0.000000 | 21.360000 |
| 25% | 32.000000 | 0.000000 | 92101.000000 | 33.990646 | -121.788090 | 0.000000 | 9.000000 | 13.050000 | 13.000000 | 30.400000 | 400.150000 | 0.000000 | 0.000000 | 70.545000 | 605.610000 |
| 50% | 46.000000 | 0.000000 | 93518.000000 | 36.205465 | -119.595293 | 0.000000 | 29.000000 | 25.690000 | 21.000000 | 70.050000 | 1394.550000 | 0.000000 | 0.000000 | 401.440000 | 2108.640000 |
| 75% | 60.000000 | 0.000000 | 95329.000000 | 38.161321 | -117.969795 | 3.000000 | 55.000000 | 37.680000 | 30.000000 | 89.750000 | 3786.600000 | 0.000000 | 0.000000 | 1191.100000 | 4801.145000 |
| max | 80.000000 | 9.000000 | 96150.000000 | 41.962127 | -114.192901 | 11.000000 | 72.000000 | 49.990000 | 85.000000 | 118.750000 | 8684.800000 | 49.790000 | 150.000000 | 3564.720000 | 11979.340000 |
#Checking data types of each features and the number of non-null values.
Churn.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 38 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer ID 7043 non-null object 1 Gender 7043 non-null object 2 Age 7043 non-null int64 3 Married 7043 non-null object 4 Number of Dependents 7043 non-null int64 5 City 7043 non-null object 6 Zip Code 7043 non-null int64 7 Latitude 7043 non-null float64 8 Longitude 7043 non-null float64 9 Number of Referrals 7043 non-null int64 10 Tenure in Months 7043 non-null int64 11 Offer 3166 non-null object 12 Phone Service 7043 non-null object 13 Avg Monthly Long Distance Charges 6361 non-null float64 14 Multiple Lines 6361 non-null object 15 Internet Service 7043 non-null object 16 Internet Type 5517 non-null object 17 Avg Monthly GB Download 5517 non-null float64 18 Online Security 5517 non-null object 19 Online Backup 5517 non-null object 20 Device Protection Plan 5517 non-null object 21 Premium Tech Support 5517 non-null object 22 Streaming TV 5517 non-null object 23 Streaming Movies 5517 non-null object 24 Streaming Music 5517 non-null object 25 Unlimited Data 5517 non-null object 26 Contract 7043 non-null object 27 Paperless Billing 7043 non-null object 28 Payment Method 7043 non-null object 29 Monthly Charge 7043 non-null float64 30 Total Charges 7043 non-null float64 31 Total Refunds 7043 non-null float64 32 Total Extra Data Charges 7043 non-null int64 33 Total Long Distance Charges 7043 non-null float64 34 Total Revenue 7043 non-null float64 35 Customer Status 7043 non-null object 36 Churn Category 1869 non-null object 37 Churn Reason 1869 non-null object dtypes: float64(9), int64(6), object(23) memory usage: 2.0+ MB
# Checking data types
Churn.dtypes
Customer ID object Gender object Age int64 Married object Number of Dependents int64 City object Zip Code int64 Latitude float64 Longitude float64 Number of Referrals int64 Tenure in Months int64 Offer object Phone Service object Avg Monthly Long Distance Charges float64 Multiple Lines object Internet Service object Internet Type object Avg Monthly GB Download float64 Online Security object Online Backup object Device Protection Plan object Premium Tech Support object Streaming TV object Streaming Movies object Streaming Music object Unlimited Data object Contract object Paperless Billing object Payment Method object Monthly Charge float64 Total Charges float64 Total Refunds float64 Total Extra Data Charges int64 Total Long Distance Charges float64 Total Revenue float64 Customer Status object Churn Category object Churn Reason object dtype: object
The joined observation would be removed from the target variable because the aim of the analysis is to predict telecom customer churn & not whether new customer joined. To predict the customer churn we would be using the "Stayed" or "churned".
Churn = Churn[Churn['Customer Status'] != 'Joined']
print(Churn['Customer Status'])
0 Stayed
1 Stayed
2 Churned
3 Churned
4 Churned
...
7037 Churned
7038 Stayed
7039 Churned
7041 Stayed
7042 Stayed
Name: Customer Status, Length: 6589, dtype: object
# Checking shape of datset after removing joined
Churn.shape
(6589, 38)
#Checking tail of dataset (last 10)
Churn.tail(n=10)
| Customer ID | Gender | Age | Married | Number of Dependents | City | Zip Code | Latitude | Longitude | Number of Referrals | ... | Payment Method | Monthly Charge | Total Charges | Total Refunds | Total Extra Data Charges | Total Long Distance Charges | Total Revenue | Customer Status | Churn Category | Churn Reason | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7031 | 9974-JFBHQ | Male | 31 | No | 1 | Del Rey | 93616 | 36.657462 | -119.595293 | 0 | ... | Credit Card | 99.15 | 6171.20 | 0.0 | 0 | 1434.24 | 7605.44 | Stayed | NaN | NaN |
| 7032 | 9975-GPKZU | Male | 62 | Yes | 3 | Alameda | 94501 | 37.774633 | -122.274434 | 0 | ... | Credit Card | 19.75 | 856.50 | 0.0 | 0 | 352.82 | 1209.32 | Stayed | NaN | NaN |
| 7034 | 9978-HYCIN | Male | 72 | Yes | 1 | Bakersfield | 93301 | 35.383937 | -119.020428 | 1 | ... | Bank Withdrawal | 84.95 | 4018.05 | 0.0 | 80 | 1987.63 | 6085.68 | Stayed | NaN | NaN |
| 7035 | 9979-RGMZT | Female | 20 | No | 0 | Los Angeles | 90022 | 34.023810 | -118.156582 | 0 | ... | Credit Card | 94.05 | 633.45 | 0.0 | 0 | 255.43 | 888.88 | Stayed | NaN | NaN |
| 7036 | 9985-MWVIX | Female | 53 | No | 0 | Hume | 93628 | 36.807595 | -118.901544 | 0 | ... | Credit Card | 70.15 | 70.15 | 0.0 | 0 | 42.09 | 112.24 | Churned | Competitor | Competitor had better devices |
| 7037 | 9986-BONCE | Female | 36 | No | 0 | Fallbrook | 92028 | 33.362575 | -117.299644 | 0 | ... | Bank Withdrawal | 20.95 | 85.50 | 0.0 | 0 | 8.04 | 93.54 | Churned | Competitor | Competitor made better offer |
| 7038 | 9987-LUTYD | Female | 20 | No | 0 | La Mesa | 91941 | 32.759327 | -116.997260 | 0 | ... | Credit Card | 55.15 | 742.90 | 0.0 | 0 | 606.84 | 1349.74 | Stayed | NaN | NaN |
| 7039 | 9992-RRAMN | Male | 40 | Yes | 0 | Riverbank | 95367 | 37.734971 | -120.954271 | 1 | ... | Bank Withdrawal | 85.10 | 1873.70 | 0.0 | 0 | 356.40 | 2230.10 | Churned | Dissatisfaction | Product dissatisfaction |
| 7041 | 9993-LHIEB | Male | 21 | Yes | 0 | Solana Beach | 92075 | 33.001813 | -117.263628 | 5 | ... | Credit Card | 67.85 | 4627.65 | 0.0 | 0 | 142.04 | 4769.69 | Stayed | NaN | NaN |
| 7042 | 9995-HOTOH | Male | 36 | Yes | 0 | Sierra City | 96125 | 39.600599 | -120.636358 | 1 | ... | Bank Withdrawal | 59.00 | 3707.60 | 0.0 | 0 | 0.00 | 3707.60 | Stayed | NaN | NaN |
10 rows × 38 columns
categorical_columns = ['Gender','Married','City', 'Offer', 'Phone Service', 'Multiple Lines', 'Internet Service',
'Internet Type', 'Online Security', 'Number of Dependents', 'Number of Referrals', 'Online Backup',
'Device Protection Plan', 'Premium Tech Support', 'Streaming TV',
'Streaming Movies', 'Streaming Music', 'Unlimited Data', 'Contract','Total Extra Data Charges', 'Paperless Billing', 'Payment Method', 'Churn Category', 'Churn Reason', 'Customer Status']
numerical_columns = ['Age', 'Tenure in Months', 'Avg Monthly Long Distance Charges',
'Avg Monthly GB Download', 'Monthly Charge', 'Total Charges', 'Total Refunds',
'Total Long Distance Charges', 'Total Revenue']
Categorical = Churn[categorical_columns]
Numerical = Churn[numerical_columns]
#Checking head of datase
Categorical.head()
| Gender | Married | City | Offer | Phone Service | Multiple Lines | Internet Service | Internet Type | Online Security | Number of Dependents | ... | Streaming Movies | Streaming Music | Unlimited Data | Contract | Total Extra Data Charges | Paperless Billing | Payment Method | Churn Category | Churn Reason | Customer Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | Yes | Frazier Park | NaN | Yes | No | Yes | Cable | No | 0 | ... | No | No | Yes | One Year | 0 | Yes | Credit Card | NaN | NaN | Stayed |
| 1 | Male | No | Glendale | NaN | Yes | Yes | Yes | Cable | No | 0 | ... | Yes | Yes | No | Month-to-Month | 10 | No | Credit Card | NaN | NaN | Stayed |
| 2 | Male | No | Costa Mesa | Offer E | Yes | No | Yes | Fiber Optic | No | 0 | ... | No | No | Yes | Month-to-Month | 0 | Yes | Bank Withdrawal | Competitor | Competitor had better devices | Churned |
| 3 | Male | Yes | Martinez | Offer D | Yes | No | Yes | Fiber Optic | No | 0 | ... | Yes | No | Yes | Month-to-Month | 0 | Yes | Bank Withdrawal | Dissatisfaction | Product dissatisfaction | Churned |
| 4 | Female | Yes | Camarillo | NaN | Yes | No | Yes | Fiber Optic | No | 0 | ... | No | No | Yes | Month-to-Month | 0 | Yes | Credit Card | Dissatisfaction | Network reliability | Churned |
5 rows × 25 columns
#Checking head of dataset
Numerical.head()
| Age | Tenure in Months | Avg Monthly Long Distance Charges | Avg Monthly GB Download | Monthly Charge | Total Charges | Total Refunds | Total Long Distance Charges | Total Revenue | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 37 | 9 | 42.39 | 16.0 | 65.6 | 593.30 | 0.00 | 381.51 | 974.81 |
| 1 | 46 | 9 | 10.69 | 10.0 | -4.0 | 542.40 | 38.33 | 96.21 | 610.28 |
| 2 | 50 | 4 | 33.65 | 30.0 | 73.9 | 280.85 | 0.00 | 134.60 | 415.45 |
| 3 | 78 | 13 | 27.82 | 4.0 | 98.0 | 1237.85 | 0.00 | 361.66 | 1599.51 |
| 4 | 75 | 3 | 7.38 | 11.0 | 83.9 | 267.40 | 0.00 | 22.14 | 289.54 |
# Defining the number of rows and columns for the grid
num_cols = 5
num_rows = (len(Categorical.columns) - 2 + num_cols - 1) // num_cols
# Creating a grid of subplots
fig, axes = plt.subplots(num_rows, num_cols, figsize=(5*num_cols, 5*num_rows))
# Flatten the axes array
axes = axes.flatten()
# Setting color palette
color = sns.set_palette("pastel")
# Columns to iterate over excluding "city" and "churn reason"
columns_to_plot = [col for col in Categorical.columns if col not in ["City", "Churn Reason"]]
# Iterating over each column in the DataFrame
for i, column in enumerate(columns_to_plot):
ax = axes[i] # Get the subplot axes
sns.countplot(x=column, data=Categorical, ax=ax, palette= color)
ax.tick_params(axis='x', rotation=45) # Rotate x-axis labels for better readability
# Remove empty subplots
for i in range(len(columns_to_plot), num_rows * num_cols):
fig.delaxes(axes[i])
# Adjust layout
plt.tight_layout()
# Save image to include in PDF
plt.savefig('Distribution_of_Categorical_Feature.jpg')
# Show the plot
plt.show()
# Setting color palette
sns.set_palette("husl", 9)
# Defining the number of rows and columns for the grid
num_cols = 5
num_rows = (len(Numerical.columns) + num_cols - 1) // num_cols
# Creating a grid of subplots
fig, axes = plt.subplots(num_rows, num_cols, figsize=(5*num_cols, 5*num_rows))
# Flatten the axes array
axes = axes.flatten()
# Columns to iterate over excluding "Total Refunds"
columns_to_plot = [col for col in Numerical.columns if col != "Total Refunds"]
# Iterating over each column in the DataFrame
for i, column in enumerate(columns_to_plot):
ax = axes[i] # Get the subplot axes
# Plot boxplot for numerical columns
sns.boxplot(x=column, data=Numerical, ax=ax)
ax.set_title(f"Boxplot of {column}")
# Remove empty subplots
for i in range(len(columns_to_plot), num_rows * num_cols):
fig.delaxes(axes[i])
# Adjust layout
plt.tight_layout()
# Save image to include in PDF
plt.savefig('Boxplot_of_each_Numerical_Feature.jpg')
plt.show()
# Define the target variable
target_variable = 'Customer Status'
# Exclude columns from categorical_columns
columns_to_plot = [col for col in categorical_columns if col not in ['City', 'Churn Reason', 'Customer Status', 'Churn Category']]
# Calculate the number of columns for the subplot grid
num_cols = 3 # You may adjust this value as needed
# Calculate the actual number of rows needed
num_rows = (len(columns_to_plot) + num_cols - 1) // num_cols
# Create a grid of subplots
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5*num_rows))
# Flatten the axes array
axes = axes.flatten()
# Iterate over each categorical column
for i, column in enumerate(columns_to_plot):
# Get the subplot axes
ax = axes[i]
# Plot the relationship using a bar plot
colour = ['pink','tan']
sns.set_palette(colour)
sns.countplot(x=column, hue=target_variable, data=Categorical, ax=ax)
# Set title
ax.set_title(f'{column} vs {target_variable}')
# Rotate x-axis labels for better readability
ax.tick_params(axis='x', rotation=45)
# Remove any remaining empty subplots
for j in range(len(columns_to_plot), num_rows * num_cols):
fig.delaxes(axes[j])
# Adjust layout
plt.tight_layout()
plt.savefig("Bivariate Analysis.jpg")
plt.show()
# Group by city and churn status
city_churn_counts = Churn.groupby(['City', 'Customer Status']).size().unstack().reset_index()
city_churn_counts.fillna(0, inplace=True) # Fill NaN values with 0 for cities with no churn data
# Create a map centered around a latitude and longitude
churn_map = folium.Map(location=[Churn['Latitude'].mean(), Churn['Longitude'].mean()], zoom_start=5)
# Add markers for each city
for index, row in city_churn_counts.iterrows():
City = row['City']
Churned = row['Churned'] if 'Churned' in row else 0
Stayed = row['Stayed'] if 'Stayed' in row else 0
marker_popup = f"{City}<br>Churned: {Churned}<br>Stayed: {Stayed}"
folium.Marker(
location=[Churn[Churn['City'] == City]['Latitude'].values[0], Churn[Churn['City'] == City]['Longitude'].values[0]],
popup=marker_popup,
icon=folium.Icon(color='red' if Churned > Stayed else 'green')
).add_to(churn_map)
# Add a tooltip to show city name and churned/stayed count when hovering over a marker
folium.Marker(
location=[0, 0], # Dummy location
icon=None,
tooltip=folium.Tooltip("<b>Click a marker to see details</b>")
).add_to(churn_map)
# Display the map
churn_map
#Removing unnecessary Columns
Churn.drop(["Customer ID", "Zip Code", "City", "Latitude", "Longitude", "Churn Reason", "Churn Category"], axis =1, inplace= True)
# Checking head of dataset after removing columns
Churn.head(n=30)
| Gender | Age | Married | Number of Dependents | Number of Referrals | Tenure in Months | Offer | Phone Service | Avg Monthly Long Distance Charges | Multiple Lines | ... | Contract | Paperless Billing | Payment Method | Monthly Charge | Total Charges | Total Refunds | Total Extra Data Charges | Total Long Distance Charges | Total Revenue | Customer Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 37 | Yes | 0 | 2 | 9 | NaN | Yes | 42.39 | No | ... | One Year | Yes | Credit Card | 65.60 | 593.30 | 0.00 | 0 | 381.51 | 974.81 | Stayed |
| 1 | Male | 46 | No | 0 | 0 | 9 | NaN | Yes | 10.69 | Yes | ... | Month-to-Month | No | Credit Card | -4.00 | 542.40 | 38.33 | 10 | 96.21 | 610.28 | Stayed |
| 2 | Male | 50 | No | 0 | 0 | 4 | Offer E | Yes | 33.65 | No | ... | Month-to-Month | Yes | Bank Withdrawal | 73.90 | 280.85 | 0.00 | 0 | 134.60 | 415.45 | Churned |
| 3 | Male | 78 | Yes | 0 | 1 | 13 | Offer D | Yes | 27.82 | No | ... | Month-to-Month | Yes | Bank Withdrawal | 98.00 | 1237.85 | 0.00 | 0 | 361.66 | 1599.51 | Churned |
| 4 | Female | 75 | Yes | 0 | 3 | 3 | NaN | Yes | 7.38 | No | ... | Month-to-Month | Yes | Credit Card | 83.90 | 267.40 | 0.00 | 0 | 22.14 | 289.54 | Churned |
| 5 | Female | 23 | No | 3 | 0 | 9 | Offer E | Yes | 16.77 | No | ... | Month-to-Month | Yes | Credit Card | 69.40 | 571.45 | 0.00 | 0 | 150.93 | 722.38 | Stayed |
| 6 | Female | 67 | Yes | 0 | 1 | 71 | Offer A | Yes | 9.96 | No | ... | Two Year | Yes | Bank Withdrawal | 109.70 | 7904.25 | 0.00 | 0 | 707.16 | 8611.41 | Stayed |
| 7 | Male | 52 | Yes | 0 | 8 | 63 | Offer B | Yes | 12.96 | Yes | ... | Two Year | Yes | Credit Card | 84.65 | 5377.80 | 0.00 | 20 | 816.48 | 6214.28 | Stayed |
| 8 | Female | 68 | No | 0 | 0 | 7 | Offer E | Yes | 10.53 | No | ... | Two Year | Yes | Bank Withdrawal | 48.20 | 340.35 | 0.00 | 0 | 73.71 | 414.06 | Stayed |
| 9 | Female | 43 | Yes | 1 | 3 | 65 | NaN | Yes | 28.46 | Yes | ... | Two Year | Yes | Credit Card | 90.45 | 5957.90 | 0.00 | 0 | 1849.90 | 7807.80 | Stayed |
| 10 | Male | 47 | No | 0 | 0 | 54 | NaN | No | NaN | NaN | ... | Two Year | No | Credit Card | 45.20 | 2460.55 | 0.00 | 0 | 0.00 | 2460.55 | Stayed |
| 11 | Female | 25 | Yes | 2 | 2 | 72 | NaN | Yes | 16.01 | Yes | ... | Two Year | Yes | Credit Card | 116.80 | 8456.75 | 0.00 | 0 | 1152.72 | 9609.47 | Stayed |
| 12 | Female | 58 | Yes | 0 | 0 | 5 | NaN | Yes | 18.65 | No | ... | Month-to-Month | Yes | Bank Withdrawal | 68.95 | 351.50 | 0.00 | 0 | 93.25 | 444.75 | Stayed |
| 13 | Female | 32 | No | 0 | 0 | 72 | Offer A | Yes | 2.25 | Yes | ... | Two Year | Yes | Bank Withdrawal | 101.30 | 7261.25 | 0.00 | 0 | 162.00 | 7423.25 | Stayed |
| 14 | Female | 39 | No | 0 | 0 | 56 | NaN | No | NaN | NaN | ... | Two Year | No | Bank Withdrawal | 45.05 | 2560.10 | 0.00 | 0 | 0.00 | 2560.10 | Stayed |
| 15 | Female | 58 | Yes | 2 | 9 | 71 | Offer A | Yes | 27.26 | Yes | ... | Two Year | Yes | Credit Card | 95.75 | 6849.40 | 0.00 | 0 | 1935.46 | 8784.86 | Stayed |
| 16 | Female | 52 | Yes | 1 | 0 | 34 | NaN | No | NaN | NaN | ... | One Year | No | Credit Card | 61.25 | 1993.20 | 0.00 | 0 | 0.00 | 1993.20 | Stayed |
| 18 | Male | 79 | No | 0 | 0 | 45 | NaN | Yes | 10.67 | No | ... | One Year | No | Credit Card | 62.70 | 2791.50 | 0.00 | 0 | 480.15 | 3271.65 | Churned |
| 19 | Male | 67 | No | 0 | 0 | 1 | NaN | No | NaN | NaN | ... | Month-to-Month | Yes | Bank Withdrawal | 25.10 | 25.10 | 0.00 | 0 | 0.00 | 25.10 | Churned |
| 20 | Female | 79 | Yes | 0 | 0 | 50 | NaN | Yes | 31.43 | Yes | ... | One Year | No | Bank Withdrawal | 25.20 | 1306.30 | 0.00 | 0 | 1571.50 | 2877.80 | Stayed |
| 21 | Female | 26 | Yes | 0 | 1 | 13 | NaN | Yes | 43.56 | Yes | ... | Month-to-Month | No | Bank Withdrawal | 94.10 | 1215.60 | 0.00 | 0 | 566.28 | 1781.88 | Churned |
| 22 | Female | 30 | Yes | 2 | 1 | 23 | Offer D | Yes | 34.91 | Yes | ... | Month-to-Month | Yes | Bank Withdrawal | 83.75 | 1849.95 | 0.00 | 0 | 802.93 | 2652.88 | Stayed |
| 24 | Female | 34 | Yes | 0 | 0 | 4 | Offer E | Yes | 12.43 | No | ... | Month-to-Month | No | Credit Card | 20.35 | 76.35 | 0.00 | 0 | 49.72 | 126.07 | Churned |
| 25 | Female | 37 | Yes | 1 | 1 | 1 | NaN | No | NaN | NaN | ... | Month-to-Month | No | Bank Withdrawal | 30.50 | 30.50 | 0.00 | 0 | 0.00 | 30.50 | Churned |
| 26 | Female | 37 | Yes | 3 | 10 | 55 | NaN | Yes | 35.04 | No | ... | One Year | Yes | Bank Withdrawal | 103.70 | 5656.75 | 0.00 | 20 | 1927.20 | 7603.95 | Stayed |
| 27 | Male | 42 | Yes | 3 | 3 | 54 | NaN | Yes | 19.70 | No | ... | Two Year | No | Credit Card | 20.40 | 1090.60 | 0.00 | 0 | 1063.80 | 2154.40 | Stayed |
| 28 | Male | 64 | No | 0 | 0 | 26 | Offer C | Yes | 37.70 | No | ... | One Year | Yes | Bank Withdrawal | 19.60 | 471.85 | 21.25 | 0 | 980.20 | 1430.80 | Stayed |
| 29 | Female | 47 | Yes | 3 | 5 | 69 | NaN | Yes | 38.39 | No | ... | Two Year | No | Bank Withdrawal | 19.70 | 1396.90 | 0.00 | 0 | 2648.91 | 4045.81 | Stayed |
| 30 | Male | 23 | Yes | 3 | 1 | 37 | Offer C | Yes | 43.01 | No | ... | One Year | No | Credit Card | 91.20 | 3247.55 | 0.00 | 0 | 1591.37 | 4838.92 | Stayed |
| 31 | Female | 48 | Yes | 0 | 5 | 49 | Offer B | Yes | 49.58 | No | ... | One Year | No | Bank Withdrawal | 20.45 | 900.90 | 0.00 | 0 | 2429.42 | 3330.32 | Stayed |
30 rows × 31 columns
#Checking number of null in each column
Churn.isnull().sum()
Gender 0 Age 0 Married 0 Number of Dependents 0 Number of Referrals 0 Tenure in Months 0 Offer 3598 Phone Service 0 Avg Monthly Long Distance Charges 644 Multiple Lines 644 Internet Service 0 Internet Type 1344 Avg Monthly GB Download 1344 Online Security 1344 Online Backup 1344 Device Protection Plan 1344 Premium Tech Support 1344 Streaming TV 1344 Streaming Movies 1344 Streaming Music 1344 Unlimited Data 1344 Contract 0 Paperless Billing 0 Payment Method 0 Monthly Charge 0 Total Charges 0 Total Refunds 0 Total Extra Data Charges 0 Total Long Distance Charges 0 Total Revenue 0 Customer Status 0 dtype: int64
# Calculate total number of rows
total_rows = Churn.shape[0]
# Calculate number of missing values in each column
missing_values = Churn.isnull().sum()
# Calculate percentage of missing values in each column
percentage_missing = (missing_values / total_rows) * 100
# Display the percentage of missing values in each column
print("Percentage of missing values in each column:")
print(percentage_missing)
Percentage of missing values in each column: Gender 0.000000 Age 0.000000 Married 0.000000 Number of Dependents 0.000000 Number of Referrals 0.000000 Tenure in Months 0.000000 Offer 54.606162 Phone Service 0.000000 Avg Monthly Long Distance Charges 9.773866 Multiple Lines 9.773866 Internet Service 0.000000 Internet Type 20.397632 Avg Monthly GB Download 20.397632 Online Security 20.397632 Online Backup 20.397632 Device Protection Plan 20.397632 Premium Tech Support 20.397632 Streaming TV 20.397632 Streaming Movies 20.397632 Streaming Music 20.397632 Unlimited Data 20.397632 Contract 0.000000 Paperless Billing 0.000000 Payment Method 0.000000 Monthly Charge 0.000000 Total Charges 0.000000 Total Refunds 0.000000 Total Extra Data Charges 0.000000 Total Long Distance Charges 0.000000 Total Revenue 0.000000 Customer Status 0.000000 dtype: float64
#Removing unnecessary Columns
Churn.drop(["Offer"], axis =1, inplace= True)
# Replacing missing values in numerical columns with the median
numerical_columns = ['Avg Monthly Long Distance Charges', 'Avg Monthly GB Download']
for column in numerical_columns:
median_value = Churn[column].median()
Churn[column].fillna(median_value, inplace=True)
# Replacing missing values in categorical columns with the mode
categorical_columns = ['Internet Type', 'Online Security', 'Multiple Lines', 'Internet Type', 'Online Security',
'Online Backup', 'Device Protection Plan', 'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
'Streaming Music', 'Unlimited Data']
for column in categorical_columns:
mode_value = Churn[column].mode()[0]
Churn[column].fillna(mode_value, inplace=True)
#Checking number of null in each column after replacing missing values
Churn.isnull().sum()
Gender 0 Age 0 Married 0 Number of Dependents 0 Number of Referrals 0 Tenure in Months 0 Phone Service 0 Avg Monthly Long Distance Charges 0 Multiple Lines 0 Internet Service 0 Internet Type 0 Avg Monthly GB Download 0 Online Security 0 Online Backup 0 Device Protection Plan 0 Premium Tech Support 0 Streaming TV 0 Streaming Movies 0 Streaming Music 0 Unlimited Data 0 Contract 0 Paperless Billing 0 Payment Method 0 Monthly Charge 0 Total Charges 0 Total Refunds 0 Total Extra Data Charges 0 Total Long Distance Charges 0 Total Revenue 0 Customer Status 0 dtype: int64
Churn.duplicated().sum()
0
# Checking head of dataset after replacing NaN Values
Churn.head(n=30)
| Gender | Age | Married | Number of Dependents | Number of Referrals | Tenure in Months | Phone Service | Avg Monthly Long Distance Charges | Multiple Lines | Internet Service | ... | Contract | Paperless Billing | Payment Method | Monthly Charge | Total Charges | Total Refunds | Total Extra Data Charges | Total Long Distance Charges | Total Revenue | Customer Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 37 | Yes | 0 | 2 | 9 | Yes | 42.39 | No | Yes | ... | One Year | Yes | Credit Card | 65.60 | 593.30 | 0.00 | 0 | 381.51 | 974.81 | Stayed |
| 1 | Male | 46 | No | 0 | 0 | 9 | Yes | 10.69 | Yes | Yes | ... | Month-to-Month | No | Credit Card | -4.00 | 542.40 | 38.33 | 10 | 96.21 | 610.28 | Stayed |
| 2 | Male | 50 | No | 0 | 0 | 4 | Yes | 33.65 | No | Yes | ... | Month-to-Month | Yes | Bank Withdrawal | 73.90 | 280.85 | 0.00 | 0 | 134.60 | 415.45 | Churned |
| 3 | Male | 78 | Yes | 0 | 1 | 13 | Yes | 27.82 | No | Yes | ... | Month-to-Month | Yes | Bank Withdrawal | 98.00 | 1237.85 | 0.00 | 0 | 361.66 | 1599.51 | Churned |
| 4 | Female | 75 | Yes | 0 | 3 | 3 | Yes | 7.38 | No | Yes | ... | Month-to-Month | Yes | Credit Card | 83.90 | 267.40 | 0.00 | 0 | 22.14 | 289.54 | Churned |
| 5 | Female | 23 | No | 3 | 0 | 9 | Yes | 16.77 | No | Yes | ... | Month-to-Month | Yes | Credit Card | 69.40 | 571.45 | 0.00 | 0 | 150.93 | 722.38 | Stayed |
| 6 | Female | 67 | Yes | 0 | 1 | 71 | Yes | 9.96 | No | Yes | ... | Two Year | Yes | Bank Withdrawal | 109.70 | 7904.25 | 0.00 | 0 | 707.16 | 8611.41 | Stayed |
| 7 | Male | 52 | Yes | 0 | 8 | 63 | Yes | 12.96 | Yes | Yes | ... | Two Year | Yes | Credit Card | 84.65 | 5377.80 | 0.00 | 20 | 816.48 | 6214.28 | Stayed |
| 8 | Female | 68 | No | 0 | 0 | 7 | Yes | 10.53 | No | Yes | ... | Two Year | Yes | Bank Withdrawal | 48.20 | 340.35 | 0.00 | 0 | 73.71 | 414.06 | Stayed |
| 9 | Female | 43 | Yes | 1 | 3 | 65 | Yes | 28.46 | Yes | Yes | ... | Two Year | Yes | Credit Card | 90.45 | 5957.90 | 0.00 | 0 | 1849.90 | 7807.80 | Stayed |
| 10 | Male | 47 | No | 0 | 0 | 54 | No | 25.72 | No | Yes | ... | Two Year | No | Credit Card | 45.20 | 2460.55 | 0.00 | 0 | 0.00 | 2460.55 | Stayed |
| 11 | Female | 25 | Yes | 2 | 2 | 72 | Yes | 16.01 | Yes | Yes | ... | Two Year | Yes | Credit Card | 116.80 | 8456.75 | 0.00 | 0 | 1152.72 | 9609.47 | Stayed |
| 12 | Female | 58 | Yes | 0 | 0 | 5 | Yes | 18.65 | No | Yes | ... | Month-to-Month | Yes | Bank Withdrawal | 68.95 | 351.50 | 0.00 | 0 | 93.25 | 444.75 | Stayed |
| 13 | Female | 32 | No | 0 | 0 | 72 | Yes | 2.25 | Yes | Yes | ... | Two Year | Yes | Bank Withdrawal | 101.30 | 7261.25 | 0.00 | 0 | 162.00 | 7423.25 | Stayed |
| 14 | Female | 39 | No | 0 | 0 | 56 | No | 25.72 | No | Yes | ... | Two Year | No | Bank Withdrawal | 45.05 | 2560.10 | 0.00 | 0 | 0.00 | 2560.10 | Stayed |
| 15 | Female | 58 | Yes | 2 | 9 | 71 | Yes | 27.26 | Yes | Yes | ... | Two Year | Yes | Credit Card | 95.75 | 6849.40 | 0.00 | 0 | 1935.46 | 8784.86 | Stayed |
| 16 | Female | 52 | Yes | 1 | 0 | 34 | No | 25.72 | No | Yes | ... | One Year | No | Credit Card | 61.25 | 1993.20 | 0.00 | 0 | 0.00 | 1993.20 | Stayed |
| 18 | Male | 79 | No | 0 | 0 | 45 | Yes | 10.67 | No | Yes | ... | One Year | No | Credit Card | 62.70 | 2791.50 | 0.00 | 0 | 480.15 | 3271.65 | Churned |
| 19 | Male | 67 | No | 0 | 0 | 1 | No | 25.72 | No | Yes | ... | Month-to-Month | Yes | Bank Withdrawal | 25.10 | 25.10 | 0.00 | 0 | 0.00 | 25.10 | Churned |
| 20 | Female | 79 | Yes | 0 | 0 | 50 | Yes | 31.43 | Yes | No | ... | One Year | No | Bank Withdrawal | 25.20 | 1306.30 | 0.00 | 0 | 1571.50 | 2877.80 | Stayed |
| 21 | Female | 26 | Yes | 0 | 1 | 13 | Yes | 43.56 | Yes | Yes | ... | Month-to-Month | No | Bank Withdrawal | 94.10 | 1215.60 | 0.00 | 0 | 566.28 | 1781.88 | Churned |
| 22 | Female | 30 | Yes | 2 | 1 | 23 | Yes | 34.91 | Yes | Yes | ... | Month-to-Month | Yes | Bank Withdrawal | 83.75 | 1849.95 | 0.00 | 0 | 802.93 | 2652.88 | Stayed |
| 24 | Female | 34 | Yes | 0 | 0 | 4 | Yes | 12.43 | No | No | ... | Month-to-Month | No | Credit Card | 20.35 | 76.35 | 0.00 | 0 | 49.72 | 126.07 | Churned |
| 25 | Female | 37 | Yes | 1 | 1 | 1 | No | 25.72 | No | Yes | ... | Month-to-Month | No | Bank Withdrawal | 30.50 | 30.50 | 0.00 | 0 | 0.00 | 30.50 | Churned |
| 26 | Female | 37 | Yes | 3 | 10 | 55 | Yes | 35.04 | No | Yes | ... | One Year | Yes | Bank Withdrawal | 103.70 | 5656.75 | 0.00 | 20 | 1927.20 | 7603.95 | Stayed |
| 27 | Male | 42 | Yes | 3 | 3 | 54 | Yes | 19.70 | No | No | ... | Two Year | No | Credit Card | 20.40 | 1090.60 | 0.00 | 0 | 1063.80 | 2154.40 | Stayed |
| 28 | Male | 64 | No | 0 | 0 | 26 | Yes | 37.70 | No | No | ... | One Year | Yes | Bank Withdrawal | 19.60 | 471.85 | 21.25 | 0 | 980.20 | 1430.80 | Stayed |
| 29 | Female | 47 | Yes | 3 | 5 | 69 | Yes | 38.39 | No | No | ... | Two Year | No | Bank Withdrawal | 19.70 | 1396.90 | 0.00 | 0 | 2648.91 | 4045.81 | Stayed |
| 30 | Male | 23 | Yes | 3 | 1 | 37 | Yes | 43.01 | No | Yes | ... | One Year | No | Credit Card | 91.20 | 3247.55 | 0.00 | 0 | 1591.37 | 4838.92 | Stayed |
| 31 | Female | 48 | Yes | 0 | 5 | 49 | Yes | 49.58 | No | No | ... | One Year | No | Bank Withdrawal | 20.45 | 900.90 | 0.00 | 0 | 2429.42 | 3330.32 | Stayed |
30 rows × 30 columns
# Checking data types before encoding
Churn.dtypes
Gender object Age int64 Married object Number of Dependents int64 Number of Referrals int64 Tenure in Months int64 Phone Service object Avg Monthly Long Distance Charges float64 Multiple Lines object Internet Service object Internet Type object Avg Monthly GB Download float64 Online Security object Online Backup object Device Protection Plan object Premium Tech Support object Streaming TV object Streaming Movies object Streaming Music object Unlimited Data object Contract object Paperless Billing object Payment Method object Monthly Charge float64 Total Charges float64 Total Refunds float64 Total Extra Data Charges int64 Total Long Distance Charges float64 Total Revenue float64 Customer Status object dtype: object
# Convert categorical columns (with 2 unique values) to numerical
# List of categorical columns excluding specified columns
categorical_columns = [col for col in Churn.select_dtypes(include=['object']).columns
if col not in ['Internet Type', 'Contract', 'Payment Method']]
# Convert categorical columns to numerical using Label Encoding
for column in categorical_columns:
le = LabelEncoder()
Churn[column] = le.fit_transform(Churn[column])
# Checking head of dataset after Encoding features with 2 unique values
Churn.head()
| Gender | Age | Married | Number of Dependents | Number of Referrals | Tenure in Months | Phone Service | Avg Monthly Long Distance Charges | Multiple Lines | Internet Service | ... | Contract | Paperless Billing | Payment Method | Monthly Charge | Total Charges | Total Refunds | Total Extra Data Charges | Total Long Distance Charges | Total Revenue | Customer Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 37 | 1 | 0 | 2 | 9 | 1 | 42.39 | 0 | 1 | ... | One Year | 1 | Credit Card | 65.6 | 593.30 | 0.00 | 0 | 381.51 | 974.81 | 1 |
| 1 | 1 | 46 | 0 | 0 | 0 | 9 | 1 | 10.69 | 1 | 1 | ... | Month-to-Month | 0 | Credit Card | -4.0 | 542.40 | 38.33 | 10 | 96.21 | 610.28 | 1 |
| 2 | 1 | 50 | 0 | 0 | 0 | 4 | 1 | 33.65 | 0 | 1 | ... | Month-to-Month | 1 | Bank Withdrawal | 73.9 | 280.85 | 0.00 | 0 | 134.60 | 415.45 | 0 |
| 3 | 1 | 78 | 1 | 0 | 1 | 13 | 1 | 27.82 | 0 | 1 | ... | Month-to-Month | 1 | Bank Withdrawal | 98.0 | 1237.85 | 0.00 | 0 | 361.66 | 1599.51 | 0 |
| 4 | 0 | 75 | 1 | 0 | 3 | 3 | 1 | 7.38 | 0 | 1 | ... | Month-to-Month | 1 | Credit Card | 83.9 | 267.40 | 0.00 | 0 | 22.14 | 289.54 | 0 |
5 rows × 30 columns
# Extract the columns you want to encode
columns_to_encode = ['Internet Type', 'Contract', 'Payment Method']
# Apply One-Hot Encoding
encoded_columns = pd.get_dummies(Churn[columns_to_encode])
# Drop the original categorical columns from the DataFrame
Churn = Churn.drop(columns_to_encode, axis=1)
# Concatenate the encoded columns with the original DataFrame
Churn = pd.concat([Churn, encoded_columns], axis=1)
# Checking head of dataset after Encoding
Churn.head(n=10)
| Gender | Age | Married | Number of Dependents | Number of Referrals | Tenure in Months | Phone Service | Avg Monthly Long Distance Charges | Multiple Lines | Internet Service | ... | Customer Status | Internet Type_Cable | Internet Type_DSL | Internet Type_Fiber Optic | Contract_Month-to-Month | Contract_One Year | Contract_Two Year | Payment Method_Bank Withdrawal | Payment Method_Credit Card | Payment Method_Mailed Check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 37 | 1 | 0 | 2 | 9 | 1 | 42.39 | 0 | 1 | ... | 1 | True | False | False | False | True | False | False | True | False |
| 1 | 1 | 46 | 0 | 0 | 0 | 9 | 1 | 10.69 | 1 | 1 | ... | 1 | True | False | False | True | False | False | False | True | False |
| 2 | 1 | 50 | 0 | 0 | 0 | 4 | 1 | 33.65 | 0 | 1 | ... | 0 | False | False | True | True | False | False | True | False | False |
| 3 | 1 | 78 | 1 | 0 | 1 | 13 | 1 | 27.82 | 0 | 1 | ... | 0 | False | False | True | True | False | False | True | False | False |
| 4 | 0 | 75 | 1 | 0 | 3 | 3 | 1 | 7.38 | 0 | 1 | ... | 0 | False | False | True | True | False | False | False | True | False |
| 5 | 0 | 23 | 0 | 3 | 0 | 9 | 1 | 16.77 | 0 | 1 | ... | 1 | True | False | False | True | False | False | False | True | False |
| 6 | 0 | 67 | 1 | 0 | 1 | 71 | 1 | 9.96 | 0 | 1 | ... | 1 | False | False | True | False | False | True | True | False | False |
| 7 | 1 | 52 | 1 | 0 | 8 | 63 | 1 | 12.96 | 1 | 1 | ... | 1 | False | False | True | False | False | True | False | True | False |
| 8 | 0 | 68 | 0 | 0 | 0 | 7 | 1 | 10.53 | 0 | 1 | ... | 1 | False | True | False | False | False | True | True | False | False |
| 9 | 0 | 43 | 1 | 1 | 3 | 65 | 1 | 28.46 | 1 | 1 | ... | 1 | True | False | False | False | False | True | False | True | False |
10 rows × 36 columns
# Checking data types after encoding
Churn.dtypes
Gender int32 Age int64 Married int32 Number of Dependents int64 Number of Referrals int64 Tenure in Months int64 Phone Service int32 Avg Monthly Long Distance Charges float64 Multiple Lines int32 Internet Service int32 Avg Monthly GB Download float64 Online Security int32 Online Backup int32 Device Protection Plan int32 Premium Tech Support int32 Streaming TV int32 Streaming Movies int32 Streaming Music int32 Unlimited Data int32 Paperless Billing int32 Monthly Charge float64 Total Charges float64 Total Refunds float64 Total Extra Data Charges int64 Total Long Distance Charges float64 Total Revenue float64 Customer Status int32 Internet Type_Cable bool Internet Type_DSL bool Internet Type_Fiber Optic bool Contract_Month-to-Month bool Contract_One Year bool Contract_Two Year bool Payment Method_Bank Withdrawal bool Payment Method_Credit Card bool Payment Method_Mailed Check bool dtype: object
# Select boolean columns
boolean_columns = Churn.select_dtypes(include='bool').columns
# Convert boolean columns to integers (0 and 1)
Churn[boolean_columns] = Churn[boolean_columns].astype(int)
# Checking head of dataset after Encoding
Churn.head(n=10)
| Gender | Age | Married | Number of Dependents | Number of Referrals | Tenure in Months | Phone Service | Avg Monthly Long Distance Charges | Multiple Lines | Internet Service | ... | Customer Status | Internet Type_Cable | Internet Type_DSL | Internet Type_Fiber Optic | Contract_Month-to-Month | Contract_One Year | Contract_Two Year | Payment Method_Bank Withdrawal | Payment Method_Credit Card | Payment Method_Mailed Check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 37 | 1 | 0 | 2 | 9 | 1 | 42.39 | 0 | 1 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 1 | 1 | 46 | 0 | 0 | 0 | 9 | 1 | 10.69 | 1 | 1 | ... | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 1 | 50 | 0 | 0 | 0 | 4 | 1 | 33.65 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 |
| 3 | 1 | 78 | 1 | 0 | 1 | 13 | 1 | 27.82 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 |
| 4 | 0 | 75 | 1 | 0 | 3 | 3 | 1 | 7.38 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 |
| 5 | 0 | 23 | 0 | 3 | 0 | 9 | 1 | 16.77 | 0 | 1 | ... | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 6 | 0 | 67 | 1 | 0 | 1 | 71 | 1 | 9.96 | 0 | 1 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
| 7 | 1 | 52 | 1 | 0 | 8 | 63 | 1 | 12.96 | 1 | 1 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 8 | 0 | 68 | 0 | 0 | 0 | 7 | 1 | 10.53 | 0 | 1 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
| 9 | 0 | 43 | 1 | 1 | 3 | 65 | 1 | 28.46 | 1 | 1 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
10 rows × 36 columns
# Select the features to be normalized
features_to_normalize = ['Avg Monthly Long Distance Charges', 'Tenure in Months', 'Age', 'Number of Referrals', 'Number of Dependents']
# Initialize the MinMaxScaler
scaler = MinMaxScaler()
# Apply Min-Max scaling to the selected features
Churn[features_to_normalize] = scaler.fit_transform(Churn[features_to_normalize])
# Checking head of dataset after normalisation
Churn.head(n=10)
| Gender | Age | Married | Number of Dependents | Number of Referrals | Tenure in Months | Phone Service | Avg Monthly Long Distance Charges | Multiple Lines | Internet Service | ... | Customer Status | Internet Type_Cable | Internet Type_DSL | Internet Type_Fiber Optic | Contract_Month-to-Month | Contract_One Year | Contract_Two Year | Payment Method_Bank Withdrawal | Payment Method_Credit Card | Payment Method_Mailed Check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.295082 | 1 | 0.000000 | 0.181818 | 0.112676 | 1 | 0.844835 | 0 | 1 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 1 | 1 | 0.442623 | 0 | 0.000000 | 0.000000 | 0.112676 | 1 | 0.197632 | 1 | 1 | ... | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 1 | 0.508197 | 0 | 0.000000 | 0.000000 | 0.042254 | 1 | 0.666394 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 |
| 3 | 1 | 0.967213 | 1 | 0.000000 | 0.090909 | 0.169014 | 1 | 0.547366 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 |
| 4 | 0 | 0.918033 | 1 | 0.000000 | 0.272727 | 0.028169 | 1 | 0.130053 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 |
| 5 | 0 | 0.065574 | 0 | 0.333333 | 0.000000 | 0.112676 | 1 | 0.321764 | 0 | 1 | ... | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 6 | 0 | 0.786885 | 1 | 0.000000 | 0.090909 | 0.985915 | 1 | 0.182728 | 0 | 1 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
| 7 | 1 | 0.540984 | 1 | 0.000000 | 0.727273 | 0.873239 | 1 | 0.243977 | 1 | 1 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 8 | 0 | 0.803279 | 0 | 0.000000 | 0.000000 | 0.084507 | 1 | 0.194365 | 0 | 1 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
| 9 | 0 | 0.393443 | 1 | 0.111111 | 0.272727 | 0.901408 | 1 | 0.560433 | 1 | 1 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
10 rows × 36 columns
# Correlation Analysis
# Selecting only numerical features (excluding the target variable)
numerical_features = Churn.select_dtypes(include='number').drop(columns=['Customer Status'])
# Compute correlation matrix
correlation_matrix = numerical_features.corr()
# Display correlation matrix
print(correlation_matrix)
Gender Age Married \
Gender 1.000000 0.007664 0.004971
Age 0.007664 1.000000 -0.014409
Married 0.004971 -0.014409 1.000000
Number of Dependents 0.006321 -0.125500 0.322628
Number of Referrals -0.004353 -0.032019 0.663598
Tenure in Months 0.011547 -0.009574 0.359737
Phone Service -0.010518 0.007853 0.023813
Avg Monthly Long Distance Charges 0.015720 -0.015403 -0.004581
Multiple Lines -0.004748 0.098179 0.123751
Internet Service -0.002584 0.110758 -0.021248
Avg Monthly GB Download -0.014879 -0.505260 0.066664
Online Security -0.017059 -0.046073 0.132193
Online Backup -0.009362 0.028884 0.125829
Device Protection Plan -0.001482 0.020439 0.136040
Premium Tech Support -0.012419 -0.052058 0.109226
Streaming TV -0.001634 -0.037936 0.122905
Streaming Movies -0.009462 -0.024728 0.117148
Streaming Music -0.008470 -0.170316 0.071881
Unlimited Data 0.004778 -0.025313 0.029683
Paperless Billing -0.010891 0.100351 -0.032990
Monthly Charge -0.017288 0.124422 0.070009
Total Charges 0.004251 0.047788 0.296830
Total Refunds 0.008031 0.020570 0.031884
Total Extra Data Charges -0.000064 0.021759 0.012018
Total Long Distance Charges 0.015564 -0.009976 0.243303
Total Revenue 0.008006 0.035080 0.308360
Internet Type_Cable -0.008537 -0.041248 0.006825
Internet Type_DSL 0.014655 -0.056854 0.003602
Internet Type_Fiber Optic -0.007227 0.078216 -0.007797
Contract_Month-to-Month -0.005772 -0.001283 -0.252296
Contract_One Year 0.008596 -0.010480 0.069478
Contract_Two Year -0.001647 0.011245 0.214997
Payment Method_Bank Withdrawal -0.005451 0.114948 -0.003856
Payment Method_Credit Card 0.007691 -0.101897 0.033121
Payment Method_Mailed Check -0.004661 -0.033586 -0.063845
Number of Dependents Number of Referrals \
Gender 0.006321 -0.004353
Age -0.125500 -0.032019
Married 0.322628 0.663598
Number of Dependents 1.000000 0.276880
Number of Referrals 0.276880 1.000000
Tenure in Months 0.107311 0.323826
Phone Service -0.008652 0.012590
Avg Monthly Long Distance Charges -0.005649 -0.003455
Multiple Lines -0.035975 0.067451
Internet Service -0.164563 -0.051425
Avg Monthly GB Download 0.229396 0.064592
Online Security 0.036838 0.137056
Online Backup -0.005134 0.104668
Device Protection Plan -0.024079 0.108669
Premium Tech Support 0.019176 0.104969
Streaming TV 0.076270 0.110200
Streaming Movies 0.061325 0.089334
Streaming Music -0.039533 0.042108
Unlimited Data 0.029913 0.042395
Paperless Billing -0.112755 -0.056960
Monthly Charge -0.136297 0.011015
Total Charges 0.016118 0.241943
Total Refunds 0.012211 0.020372
Total Extra Data Charges -0.015797 -0.002547
Total Long Distance Charges 0.065869 0.207028
Total Revenue 0.032315 0.253851
Internet Type_Cable 0.006085 0.009476
Internet Type_DSL 0.010465 0.027511
Internet Type_Fiber Optic -0.013380 -0.030775
Contract_Month-to-Month -0.128389 -0.272251
Contract_One Year 0.004264 0.066514
Contract_Two Year 0.138544 0.239930
Payment Method_Bank Withdrawal -0.089444 -0.053171
Payment Method_Credit Card 0.099719 0.077497
Payment Method_Mailed Check -0.018555 -0.050879
Tenure in Months Phone Service \
Gender 0.011547 -0.010518
Age -0.009574 0.007853
Married 0.359737 0.023813
Number of Dependents 0.107311 -0.008652
Number of Referrals 0.323826 0.012590
Tenure in Months 1.000000 0.013282
Phone Service 0.013282 1.000000
Avg Monthly Long Distance Charges 0.006230 -0.004927
Multiple Lines 0.299944 0.294161
Internet Service -0.000288 -0.166607
Avg Monthly GB Download 0.034121 -0.032934
Online Security 0.315726 -0.097259
Online Backup 0.346104 -0.048827
Device Protection Plan 0.336471 -0.073776
Premium Tech Support 0.312314 -0.093205
Streaming TV 0.251734 0.113186
Streaming Movies 0.258661 0.103364
Streaming Music 0.211155 -0.040135
Unlimited Data -0.003243 0.041242
Paperless Billing -0.024360 0.012967
Monthly Charge 0.199019 0.243944
Total Charges 0.812795 0.123390
Total Refunds 0.042729 0.006675
Total Extra Data Charges 0.071224 -0.029174
Total Long Distance Charges 0.653170 0.307688
Total Revenue 0.840935 0.189668
Internet Type_Cable -0.005228 -0.224363
Internet Type_DSL 0.016244 -0.334472
Internet Type_Fiber Optic -0.010868 0.447803
Contract_Month-to-Month -0.614856 -0.002087
Contract_One Year 0.168878 0.000181
Contract_Two Year 0.524372 0.002147
Payment Method_Bank Withdrawal -0.025183 0.009661
Payment Method_Credit Card 0.084859 0.004319
Payment Method_Mailed Check -0.129430 -0.031003
Avg Monthly Long Distance Charges \
Gender 0.015720
Age -0.015403
Married -0.004581
Number of Dependents -0.005649
Number of Referrals -0.003455
Tenure in Months 0.006230
Phone Service -0.004927
Avg Monthly Long Distance Charges 1.000000
Multiple Lines -0.005364
Internet Service 0.011725
Avg Monthly GB Download 0.013169
Online Security 0.014592
Online Backup 0.010468
Device Protection Plan -0.001754
Premium Tech Support -0.005476
Streaming TV 0.000106
Streaming Movies 0.009270
Streaming Music 0.028029
Unlimited Data -0.012503
Paperless Billing 0.015637
Monthly Charge 0.012426
Total Charges 0.011565
Total Refunds -0.024660
Total Extra Data Charges 0.019051
Total Long Distance Charges 0.548711
Total Revenue 0.173532
Internet Type_Cable 0.014270
Internet Type_DSL -0.008801
Internet Type_Fiber Optic -0.001829
Contract_Month-to-Month -0.002238
Contract_One Year 0.030331
Contract_Two Year -0.025937
Payment Method_Bank Withdrawal 0.013207
Payment Method_Credit Card -0.010261
Payment Method_Mailed Check -0.007024
Multiple Lines Internet Service ... \
Gender -0.004748 -0.002584 ...
Age 0.098179 0.110758 ...
Married 0.123751 -0.021248 ...
Number of Dependents -0.035975 -0.164563 ...
Number of Referrals 0.067451 -0.051425 ...
Tenure in Months 0.299944 -0.000288 ...
Phone Service 0.294161 -0.166607 ...
Avg Monthly Long Distance Charges -0.005364 0.011725 ...
Multiple Lines 1.000000 0.199997 ...
Internet Service 0.199997 1.000000 ...
Avg Monthly GB Download -0.001234 0.119774 ...
Online Security 0.083926 0.330946 ...
Online Backup 0.187246 0.380024 ...
Device Protection Plan 0.184029 0.381903 ...
Premium Tech Support 0.087188 0.333822 ...
Streaming TV 0.076823 -0.406993 ...
Streaming Movies 0.081905 -0.403762 ...
Streaming Music 0.180489 0.387690 ...
Unlimited Data -0.037594 -0.177853 ...
Paperless Billing 0.154991 0.318405 ...
Monthly Charge 0.458214 0.721391 ...
Total Charges 0.451265 0.376125 ...
Total Refunds 0.036225 0.012650 ...
Total Extra Data Charges 0.057678 0.140662 ...
Total Long Distance Charges 0.300490 -0.042382 ...
Total Revenue 0.448428 0.287064 ...
Internet Type_Cable -0.101236 0.184681 ...
Internet Type_DSL -0.149177 0.279211 ...
Internet Type_Fiber Optic 0.200512 -0.372054 ...
Contract_Month-to-Month -0.070235 0.223913 ...
Contract_One Year -0.020749 -0.035474 ...
Contract_Two Year 0.097419 -0.215349 ...
Payment Method_Bank Withdrawal 0.132185 0.259444 ...
Payment Method_Credit Card -0.093775 -0.225085 ...
Payment Method_Mailed Check -0.089814 -0.086532 ...
Total Revenue Internet Type_Cable \
Gender 0.008006 -0.008537
Age 0.035080 -0.041248
Married 0.308360 0.006825
Number of Dependents 0.032315 0.006085
Number of Referrals 0.253851 0.009476
Tenure in Months 0.840935 -0.005228
Phone Service 0.189668 -0.224363
Avg Monthly Long Distance Charges 0.173532 0.014270
Multiple Lines 0.448428 -0.101236
Internet Service 0.287064 0.184681
Avg Monthly GB Download 0.063804 0.057459
Online Security 0.377673 0.148439
Online Backup 0.468395 0.076588
Device Protection Plan 0.461121 0.069854
Premium Tech Support 0.389047 0.151191
Streaming TV 0.213616 -0.129448
Streaming Movies 0.220091 -0.134447
Streaming Music 0.380614 0.041838
Unlimited Data -0.057592 -0.034594
Paperless Billing 0.116670 -0.015241
Monthly Charge 0.554815 -0.058105
Total Charges 0.970266 -0.032446
Total Refunds 0.022574 -0.012800
Total Extra Data Charges 0.114636 0.011158
Total Long Distance Charges 0.765651 -0.070568
Total Revenue 1.000000 -0.046718
Internet Type_Cable -0.046718 1.000000
Internet Type_DSL -0.054856 -0.201234
Internet Type_Fiber Optic 0.080137 -0.496382
Contract_Month-to-Month -0.451532 -0.006728
Contract_One Year 0.155290 0.010886
Contract_Two Year 0.355781 -0.002732
Payment Method_Bank Withdrawal 0.062803 -0.035113
Payment Method_Credit Card -0.001403 0.012819
Payment Method_Mailed Check -0.137061 0.050307
Internet Type_DSL \
Gender 0.014655
Age -0.056854
Married 0.003602
Number of Dependents 0.010465
Number of Referrals 0.027511
Tenure in Months 0.016244
Phone Service -0.334472
Avg Monthly Long Distance Charges -0.008801
Multiple Lines -0.149177
Internet Service 0.279211
Avg Monthly GB Download 0.074321
Online Security 0.249760
Online Backup 0.121076
Device Protection Plan 0.120539
Premium Tech Support 0.236722
Streaming TV -0.219388
Streaming Movies -0.208604
Streaming Music 0.060038
Unlimited Data -0.057505
Paperless Billing -0.058684
Monthly Charge -0.129892
Total Charges -0.036622
Total Refunds 0.006062
Total Extra Data Charges 0.057450
Total Long Distance Charges -0.087915
Total Revenue -0.054856
Internet Type_Cable -0.201234
Internet Type_DSL 1.000000
Internet Type_Fiber Optic -0.750457
Contract_Month-to-Month -0.047333
Contract_One Year 0.040011
Contract_Two Year 0.015057
Payment Method_Bank Withdrawal -0.056924
Payment Method_Credit Card 0.053642
Payment Method_Mailed Check 0.009676
Internet Type_Fiber Optic \
Gender -0.007227
Age 0.078216
Married -0.007797
Number of Dependents -0.013380
Number of Referrals -0.030775
Tenure in Months -0.010868
Phone Service 0.447803
Avg Monthly Long Distance Charges -0.001829
Multiple Lines 0.200512
Internet Service -0.372054
Avg Monthly GB Download -0.104635
Online Security -0.321500
Online Backup -0.158977
Device Protection Plan -0.153957
Premium Tech Support -0.311804
Streaming TV 0.281770
Streaming Movies 0.275586
Streaming Music -0.081437
Unlimited Data 0.074305
Paperless Billing 0.062292
Monthly Charge 0.154319
Total Charges 0.054347
Total Refunds 0.003264
Total Extra Data Charges -0.058443
Total Long Distance Charges 0.125527
Total Revenue 0.080137
Internet Type_Cable -0.496382
Internet Type_DSL -0.750457
Internet Type_Fiber Optic 1.000000
Contract_Month-to-Month 0.046488
Contract_One Year -0.042804
Contract_Two Year -0.011501
Payment Method_Bank Withdrawal 0.074140
Payment Method_Credit Card -0.056188
Payment Method_Mailed Check -0.042518
Contract_Month-to-Month Contract_One Year \
Gender -0.005772 0.008596
Age -0.001283 -0.010480
Married -0.252296 0.069478
Number of Dependents -0.128389 0.004264
Number of Referrals -0.272251 0.066514
Tenure in Months -0.614856 0.168878
Phone Service -0.002087 0.000181
Avg Monthly Long Distance Charges -0.002238 0.030331
Multiple Lines -0.070235 -0.020749
Internet Service 0.223913 -0.035474
Avg Monthly GB Download 0.000653 0.019519
Online Security -0.225947 0.091162
Online Backup -0.154417 0.074896
Device Protection Plan -0.203021 0.082671
Premium Tech Support -0.256663 0.078670
Streaming TV -0.276596 0.079672
Streaming Movies -0.286515 0.082180
Streaming Music -0.067183 0.032663
Unlimited Data -0.050649 0.000779
Paperless Billing 0.171170 -0.051747
Monthly Charge 0.067649 -0.003094
Total Charges -0.418397 0.145967
Total Refunds -0.037443 0.004841
Total Extra Data Charges -0.003023 0.021882
Total Long Distance Charges -0.399747 0.131272
Total Revenue -0.451532 0.155290
Internet Type_Cable -0.006728 0.010886
Internet Type_DSL -0.047333 0.040011
Internet Type_Fiber Optic 0.046488 -0.042804
Contract_Month-to-Month 1.000000 -0.533797
Contract_One Year -0.533797 1.000000
Contract_Two Year -0.610011 -0.344435
Payment Method_Bank Withdrawal 0.163780 -0.053275
Payment Method_Credit Card -0.186632 0.054667
Payment Method_Mailed Check 0.042806 -0.000710
Contract_Two Year \
Gender -0.001647
Age 0.011245
Married 0.214997
Number of Dependents 0.138544
Number of Referrals 0.239930
Tenure in Months 0.524372
Phone Service 0.002147
Avg Monthly Long Distance Charges -0.025937
Multiple Lines 0.097419
Internet Service -0.215349
Avg Monthly GB Download -0.019016
Online Security 0.165425
Online Backup 0.101252
Device Protection Plan 0.147928
Premium Tech Support 0.211232
Streaming TV 0.232423
Streaming Movies 0.241085
Streaming Music 0.043980
Unlimited Data 0.055501
Paperless Billing -0.141546
Monthly Charge -0.072206
Total Charges 0.327729
Total Refunds 0.037034
Total Extra Data Charges -0.017149
Total Long Distance Charges 0.320794
Total Revenue 0.355781
Internet Type_Cable -0.002732
Internet Type_DSL 0.015057
Internet Type_Fiber Optic -0.011501
Contract_Month-to-Month -0.610011
Contract_One Year -0.344435
Contract_Two Year 1.000000
Payment Method_Bank Withdrawal -0.131909
Payment Method_Credit Card 0.155975
Payment Method_Mailed Check -0.046859
Payment Method_Bank Withdrawal \
Gender -0.005451
Age 0.114948
Married -0.003856
Number of Dependents -0.089444
Number of Referrals -0.053171
Tenure in Months -0.025183
Phone Service 0.009661
Avg Monthly Long Distance Charges 0.013207
Multiple Lines 0.132185
Internet Service 0.259444
Avg Monthly GB Download -0.005962
Online Security -0.039649
Online Backup 0.059466
Device Protection Plan 0.055254
Premium Tech Support -0.035901
Streaming TV -0.049095
Streaming Movies -0.050527
Streaming Music 0.121618
Unlimited Data -0.058125
Paperless Billing 0.179961
Monthly Charge 0.268894
Total Charges 0.082373
Total Refunds 0.026451
Total Extra Data Charges 0.046901
Total Long Distance Charges -0.009761
Total Revenue 0.062803
Internet Type_Cable -0.035113
Internet Type_DSL -0.056924
Internet Type_Fiber Optic 0.074140
Contract_Month-to-Month 0.163780
Contract_One Year -0.053275
Contract_Two Year -0.131909
Payment Method_Bank Withdrawal 1.000000
Payment Method_Credit Card -0.897752
Payment Method_Mailed Check -0.267501
Payment Method_Credit Card \
Gender 0.007691
Age -0.101897
Married 0.033121
Number of Dependents 0.099719
Number of Referrals 0.077497
Tenure in Months 0.084859
Phone Service 0.004319
Avg Monthly Long Distance Charges -0.010261
Multiple Lines -0.093775
Internet Service -0.225085
Avg Monthly GB Download 0.007092
Online Security 0.051155
Online Backup -0.023816
Device Protection Plan -0.028157
Premium Tech Support 0.052222
Streaming TV 0.057325
Streaming Movies 0.062196
Streaming Music -0.092474
Unlimited Data 0.052610
Paperless Billing -0.156198
Monthly Charge -0.215994
Total Charges -0.021773
Total Refunds -0.015421
Total Extra Data Charges -0.033446
Total Long Distance Charges 0.053947
Total Revenue -0.001403
Internet Type_Cable 0.012819
Internet Type_DSL 0.053642
Internet Type_Fiber Optic -0.056188
Contract_Month-to-Month -0.186632
Contract_One Year 0.054667
Contract_Two Year 0.155975
Payment Method_Bank Withdrawal -0.897752
Payment Method_Credit Card 1.000000
Payment Method_Mailed Check -0.184299
Payment Method_Mailed Check
Gender -0.004661
Age -0.033586
Married -0.063845
Number of Dependents -0.018555
Number of Referrals -0.050879
Tenure in Months -0.129430
Phone Service -0.031003
Avg Monthly Long Distance Charges -0.007024
Multiple Lines -0.089814
Internet Service -0.086532
Avg Monthly GB Download -0.002210
Online Security -0.023430
Online Backup -0.080588
Device Protection Plan -0.061693
Premium Tech Support -0.034127
Streaming TV -0.015851
Streaming Movies -0.023309
Streaming Music -0.069081
Unlimited Data 0.014612
Paperless Billing -0.059869
Monthly Charge -0.127502
Total Charges -0.136169
Total Refunds -0.025287
Total Extra Data Charges -0.031488
Total Long Distance Charges -0.096225
Total Revenue -0.137061
Internet Type_Cable 0.050307
Internet Type_DSL 0.009676
Internet Type_Fiber Optic -0.042518
Contract_Month-to-Month 0.042806
Contract_One Year -0.000710
Contract_Two Year -0.046859
Payment Method_Bank Withdrawal -0.267501
Payment Method_Credit Card -0.184299
Payment Method_Mailed Check 1.000000
[35 rows x 35 columns]
# Correlation Plot
plt.figure(figsize=(18, 18))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1, annot_kws={"size": 10})
plt.xticks(rotation=90)
plt.yticks(rotation=0)
# Add title and adjust layout
plt.title('Correlation Matrix of Numerical Features')
plt.tight_layout()
# Show plot
plt.savefig('Correlation Matrix of Numerical Features.jpg')
plt.show()
# Checking for highly Correlated Features
# Setting a threshold for high correlation
threshold = 0.7
# Initializing a list to store highly correlated features pairs
highly_correlated_pairs = []
# Iterating through the correlation matrix
for i in range(len(correlation_matrix.columns)):
for j in range(i+1, len(correlation_matrix.columns)):
if abs(correlation_matrix.iloc[i, j]) > threshold:
# Store the names of highly correlated features
pair = (correlation_matrix.columns[i], correlation_matrix.columns[j])
highly_correlated_pairs.append(pair)
if not highly_correlated_pairs:
print("No highly correlated features found.")
else:
print("Highly correlated feature pairs:")
for pair in highly_correlated_pairs:
print(pair)
Highly correlated feature pairs:
('Tenure in Months', 'Total Charges')
('Tenure in Months', 'Total Revenue')
('Internet Service', 'Monthly Charge')
('Unlimited Data', 'Total Extra Data Charges')
('Total Charges', 'Total Revenue')
('Total Long Distance Charges', 'Total Revenue')
('Internet Type_DSL', 'Internet Type_Fiber Optic')
('Payment Method_Bank Withdrawal', 'Payment Method_Credit Card')
# Removal of highly correlated features
Churn.drop(['Tenure in Months', 'Total Charges','Total Revenue', 'Internet Service', 'Monthly Charge','Unlimited Data',
'Total Extra Data Charges', 'Total Long Distance Charges', 'Internet Type_DSL', 'Internet Type_Fiber Optic', 'Payment Method_Bank Withdrawal', 'Payment Method_Credit Card'], axis =1, inplace= True)
#Checking Columns in dataset after feature removals
print(Churn.columns)
Index(['Gender', 'Age', 'Married', 'Number of Dependents',
'Number of Referrals', 'Phone Service',
'Avg Monthly Long Distance Charges', 'Multiple Lines',
'Avg Monthly GB Download', 'Online Security', 'Online Backup',
'Device Protection Plan', 'Premium Tech Support', 'Streaming TV',
'Streaming Movies', 'Streaming Music', 'Paperless Billing',
'Total Refunds', 'Customer Status', 'Internet Type_Cable',
'Contract_Month-to-Month', 'Contract_One Year', 'Contract_Two Year',
'Payment Method_Mailed Check'],
dtype='object')
# Selecting only numerical columns
numerical_columns = Churn.select_dtypes(include=['int64', 'float64']).columns
# Calculate VIF
vif_data = pd.DataFrame()
vif_data["Feature"] = numerical_columns
vif_data["VIF"] = [variance_inflation_factor(Churn[numerical_columns].values, i) for i in range(len(numerical_columns))]
print("Variance Inflation Factors:")
print(vif_data)
Variance Inflation Factors:
Feature VIF
0 Age 2.338980
1 Number of Dependents 1.413638
2 Number of Referrals 1.535375
3 Avg Monthly Long Distance Charges 3.295581
4 Avg Monthly GB Download 2.371388
5 Total Refunds 1.062786
The Variance Inflation Factor showed that all the numerical features have a variance of less than 10, so none of them would be removed.
# Iterate over each feature (excluding the target variable) in the dataset
for feature in Churn.columns[:-1]:
# Check if the feature is integer-encoded (assuming categorical)
if Churn[feature].dtype == 'int32':
# Create a contingency table for the feature and the target variable
contingency_table = pd.crosstab(Churn[feature], Churn['Customer Status'])
# Perform the chi-square test
chi2_statistic, p_value, _, _ = chi2_contingency(contingency_table)
# Print the results
print(f"Chi-square test results for {feature}:")
print(f" Chi-square statistic: {chi2_statistic}")
print(f" P-value: {p_value}")
# Check for significance (e.g., p-value < 0.05)
if p_value < 0.05:
print(f" {feature} is statistically significant for predicting the target variable.")
else:
print(f" {feature} is not statistically significant for predicting the target variable.")
print()
Chi-square test results for Gender: Chi-square statistic: 0.24007105217966312 P-value: 0.6241548018547587 Gender is not statistically significant for predicting the target variable. Chi-square test results for Married: Chi-square statistic: 220.5060439643492 P-value: 7.014602453362894e-50 Married is statistically significant for predicting the target variable. Chi-square test results for Phone Service: Chi-square statistic: 1.2551734871989866 P-value: 0.26256666476273127 Phone Service is not statistically significant for predicting the target variable. Chi-square test results for Multiple Lines: Chi-square statistic: 1.1535971852605198 P-value: 0.2827973969037025 Multiple Lines is not statistically significant for predicting the target variable. Chi-square test results for Online Security: Chi-square statistic: 248.43749367640638 P-value: 5.689634112881904e-56 Online Security is statistically significant for predicting the target variable. Chi-square test results for Online Backup: Chi-square statistic: 73.07537713497942 P-value: 1.247916969909602e-17 Online Backup is statistically significant for predicting the target variable. Chi-square test results for Device Protection Plan: Chi-square statistic: 56.671577391479005 P-value: 5.1502608160860495e-14 Device Protection Plan is statistically significant for predicting the target variable. Chi-square test results for Premium Tech Support: Chi-square statistic: 231.66761732586693 P-value: 2.580290732051722e-52 Premium Tech Support is statistically significant for predicting the target variable. Chi-square test results for Streaming TV: Chi-square statistic: 135.09725737576034 P-value: 3.1438574036725362e-31 Streaming TV is statistically significant for predicting the target variable. Chi-square test results for Streaming Movies: Chi-square statistic: 139.63452425587423 P-value: 3.199893781319627e-32 Streaming Movies is statistically significant for predicting the target variable. Chi-square test results for Streaming Music: Chi-square statistic: 4.274339099232788 P-value: 0.03869200784160805 Streaming Music is statistically significant for predicting the target variable. Chi-square test results for Paperless Billing: Chi-square statistic: 231.29426752260608 P-value: 3.112345421939546e-52 Paperless Billing is statistically significant for predicting the target variable. Chi-square test results for Customer Status: Chi-square statistic: 6584.07952972205 P-value: 0.0 Customer Status is statistically significant for predicting the target variable. Chi-square test results for Internet Type_Cable: Chi-square statistic: 0.26359201971038076 P-value: 0.6076630592581913 Internet Type_Cable is not statistically significant for predicting the target variable. Chi-square test results for Contract_Month-to-Month: Chi-square statistic: 1665.0430834588672 P-value: 0.0 Contract_Month-to-Month is statistically significant for predicting the target variable. Chi-square test results for Contract_One Year: Chi-square statistic: 297.7642870461098 P-value: 1.0112624954818955e-66 Contract_One Year is statistically significant for predicting the target variable. Chi-square test results for Contract_Two Year: Chi-square statistic: 846.9257076455871 P-value: 3.3878596704718002e-186 Contract_Two Year is statistically significant for predicting the target variable.
# Removal of non statistically significant features
Churn.drop(["Gender", "Phone Service", "Multiple Lines", "Internet Type_Cable"], axis =1, inplace= True)
# Checking Shape of dataset after removal of irrelevant features
Churn.shape
(6589, 20)
#Splitting the dataset into features (X) and target variable (Y)
X = Churn.drop(columns=['Customer Status'], axis=1)
Y = Churn["Customer Status"]
# Shuffle your data
X,Y = shuffle(X, Y, random_state=42)
X.head()
| Age | Married | Number of Dependents | Number of Referrals | Avg Monthly Long Distance Charges | Avg Monthly GB Download | Online Security | Online Backup | Device Protection Plan | Premium Tech Support | Streaming TV | Streaming Movies | Streaming Music | Paperless Billing | Total Refunds | Contract_Month-to-Month | Contract_One Year | Contract_Two Year | Payment Method_Mailed Check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2059 | 0.606557 | 0 | 0.000000 | 0.000000 | 0.111474 | 21.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.00 | 1 | 0 | 0 | 0 |
| 4323 | 0.491803 | 1 | 0.000000 | 0.909091 | 0.903838 | 21.0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0.00 | 0 | 1 | 0 | 0 |
| 1544 | 0.163934 | 1 | 0.222222 | 0.909091 | 0.504492 | 69.0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0.00 | 0 | 0 | 1 | 0 |
| 2644 | 0.770492 | 1 | 0.000000 | 0.545455 | 0.634341 | 8.0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0.00 | 0 | 1 | 0 | 0 |
| 4029 | 0.655738 | 1 | 0.111111 | 1.000000 | 0.841364 | 25.0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 25.49 | 0 | 1 | 0 | 0 |
Y.head()
2059 0 4323 1 1544 1 2644 1 4029 1 Name: Customer Status, dtype: int32
import statsmodels.api as sm
logit_model=sm.Logit(Y,X)
result=logit_model.fit()
print(result.summary2())
Optimization terminated successfully.
Current function value: 0.365385
Iterations 8
Results: Logit
==================================================================================
Model: Logit Method: MLE
Dependent Variable: Customer Status Pseudo R-squared: 0.387
Date: 2024-05-05 21:37 AIC: 4853.0382
No. Observations: 6589 BIC: 4982.1082
Df Model: 18 Log-Likelihood: -2407.5
Df Residuals: 6570 LL-Null: -3929.5
Converged: 1.0000 LLR p-value: 0.0000
No. Iterations: 8.0000 Scale: 1.0000
----------------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
----------------------------------------------------------------------------------
Age -1.4799 0.1598 -9.2624 0.0000 -1.7931 -1.1668
Married -1.4031 0.1056 -13.2850 0.0000 -1.6101 -1.1961
Number of Dependents 6.2976 0.5564 11.3175 0.0000 5.2070 7.3882
Number of Referrals 5.9307 0.3722 15.9355 0.0000 5.2013 6.6602
Avg Monthly Long Distance Charges 0.0518 0.1300 0.3983 0.6904 -0.2031 0.3067
Avg Monthly GB Download -0.0047 0.0025 -1.9219 0.0546 -0.0095 0.0001
Online Security 0.5956 0.0909 6.5535 0.0000 0.4174 0.7737
Online Backup 0.2666 0.0814 3.2754 0.0011 0.1071 0.4261
Device Protection Plan -0.0436 0.0838 -0.5205 0.6027 -0.2079 0.1206
Premium Tech Support 0.4536 0.0911 4.9818 0.0000 0.2752 0.6321
Streaming TV -0.1720 0.0849 -2.0261 0.0428 -0.3383 -0.0056
Streaming Movies 0.7710 0.1100 7.0106 0.0000 0.5554 0.9865
Streaming Music -1.0547 0.1081 -9.7557 0.0000 -1.2666 -0.8428
Paperless Billing -0.4579 0.0792 -5.7805 0.0000 -0.6131 -0.3026
Total Refunds 0.0095 0.0048 1.9988 0.0456 0.0002 0.0189
Contract_Month-to-Month 0.6522 0.1538 4.2414 0.0000 0.3508 0.9536
Contract_One Year 2.6115 0.1885 13.8548 0.0000 2.2420 2.9809
Contract_Two Year 3.8988 0.2299 16.9601 0.0000 3.4482 4.3493
Payment Method_Mailed Check -0.7775 0.1495 -5.2020 0.0000 -1.0705 -0.4846
==================================================================================
#Removing features with P values higher than 0.05
# List of columns to drop
columns_to_drop = ['Avg Monthly Long Distance Charges','Avg Monthly GB Download', 'Device Protection Plan']
# Drop the specified columns from the DataFrame X
X = X.drop(columns_to_drop, axis=1)
logit_model=sm.Logit(Y,X)
result2=logit_model.fit()
print(result2.summary2())
Optimization terminated successfully.
Current function value: 0.365699
Iterations 8
Results: Logit
============================================================================
Model: Logit Method: MLE
Dependent Variable: Customer Status Pseudo R-squared: 0.387
Date: 2024-05-05 21:37 AIC: 4851.1792
No. Observations: 6589 BIC: 4959.8697
Df Model: 15 Log-Likelihood: -2409.6
Df Residuals: 6573 LL-Null: -3929.5
Converged: 1.0000 LLR p-value: 0.0000
No. Iterations: 8.0000 Scale: 1.0000
----------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
----------------------------------------------------------------------------
Age -1.3355 0.1384 -9.6518 0.0000 -1.6067 -1.0643
Married -1.4060 0.1055 -13.3273 0.0000 -1.6128 -1.1992
Number of Dependents 6.1496 0.5511 11.1589 0.0000 5.0695 7.2298
Number of Referrals 5.9200 0.3718 15.9218 0.0000 5.1913 6.6488
Online Security 0.5932 0.0908 6.5345 0.0000 0.4153 0.7711
Online Backup 0.2601 0.0810 3.2106 0.0013 0.1013 0.4189
Premium Tech Support 0.4500 0.0905 4.9719 0.0000 0.2726 0.6274
Streaming TV -0.1733 0.0846 -2.0491 0.0405 -0.3390 -0.0075
Streaming Movies 0.7908 0.1091 7.2481 0.0000 0.5769 1.0046
Streaming Music -1.0849 0.1056 -10.2775 0.0000 -1.2918 -0.8780
Paperless Billing -0.4611 0.0791 -5.8289 0.0000 -0.6162 -0.3061
Total Refunds 0.0095 0.0048 1.9952 0.0460 0.0002 0.0189
Contract_Month-to-Month 0.4972 0.1044 4.7645 0.0000 0.2927 0.7018
Contract_One Year 2.4482 0.1481 16.5328 0.0000 2.1580 2.7384
Contract_Two Year 3.7308 0.1980 18.8387 0.0000 3.3426 4.1189
Payment Method_Mailed Check -0.7758 0.1493 -5.1955 0.0000 -1.0685 -0.4832
============================================================================
# Splitting the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
# Checking the shape of the resulting sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of Y_train:", Y_train.shape)
print("Shape of Y_test:", Y_test.shape)
Shape of X_train: (4612, 16) Shape of X_test: (1977, 16) Shape of Y_train: (4612,) Shape of Y_test: (1977,)
# Count the occurrences of each class
class_counts = Y_train.value_counts()
# Reorder the class counts so that Churned (0) is displayed before Stayed (1)
class_counts_reordered = class_counts.reindex([0, 1])
# Plot the class distribution
plt.figure(figsize=(8, 6))
class_counts_reordered.plot(kind='bar', color=['red', 'skyblue',])
plt.xlabel('Customer Status')
plt.ylabel('Count')
plt.title('Class Distribution before Upsampling')
plt.xticks([0, 1], ['Churned', 'Stayed'], rotation=0) # Set x-axis ticks to display class labels
plt.savefig("Target variable distribution.jpg")
plt.show()
# Initialize SMOTE with a desired sampling strategy
smote = SMOTE(sampling_strategy='auto', random_state=42)
# Upsample the minority class in the training data
X_train, Y_train = smote.fit_resample(X_train, Y_train)
X_test, Y_test = smote.fit_resample(X_test, Y_test)
# Count the occurrences of each class
class_counts = Y_train.value_counts()
# Reorder the class counts so that Churned (0) is displayed before Stayed (1)
class_counts_reordered = class_counts.reindex([0, 1])
# Plot the class distribution
plt.figure(figsize=(8, 6))
class_counts_reordered.plot(kind='bar', color=['red', 'skyblue',])
plt.xlabel('Customer Status')
plt.ylabel('Count')
plt.title('Class Distribution after Upsampling')
plt.xticks([0, 1], ['Churned', 'Stayed'], rotation=0)
plt.savefig("Distribution after SMOTE.jpg")
plt.show()
Y_train.value_counts()
Customer Status 1 3327 0 3327 Name: count, dtype: int64
Y_test.value_counts()
Customer Status 0 1393 1 1393 Name: count, dtype: int64
# Train a Logistic Regression Model (using the default parameters)
log_model = LogisticRegression(random_state=16, max_iter=1000)
# fit the model with data
log_model.fit(X_train, Y_train)
LogisticRegression(max_iter=1000, random_state=16)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(max_iter=1000, random_state=16)
y_pred_log = log_model.predict(X_test)
#printing accuracy
accuracy_log = accuracy_score(Y_test, y_pred_log)
precision_log = precision_score(Y_test, y_pred_log)
recall_log = recall_score(Y_test, y_pred_log)
f1_log = f1_score(Y_test, y_pred_log)
print("Accuracy:", accuracy_log)
print("Precision:", precision_log)
print("Recall:", recall_log)
print("F1 Score:", f1_log)
# Print classification report
print("Logistic Regression Classification Report before optimisation:")
print(classification_report(Y_test, y_pred_log))
Accuracy: 0.8201722900215362
Precision: 0.8534072900158478
Recall: 0.7731514716439339
F1 Score: 0.8112994350282486
Logistic Regression Classification Report before optimisation:
precision recall f1-score support
0 0.79 0.87 0.83 1393
1 0.85 0.77 0.81 1393
accuracy 0.82 2786
macro avg 0.82 0.82 0.82 2786
weighted avg 0.82 0.82 0.82 2786
# Define the parameter grid
param_grid = {
'C': [0.001, 0.01, 0.1, 1, 10, 100],
'penalty': ['l1', 'l2'],
'solver': ['liblinear', 'saga']
}
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# Creating a logistic regression model
logistic_regression = LogisticRegression(max_iter=1000, random_state=16)
# The GridSearchCV object
grid_search = GridSearchCV(logistic_regression, param_grid, cv=5, scoring='accuracy')
# Performing grid search to find the best hyperparameters
grid_search.fit(X_train, Y_train)
GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=1000, random_state=16),
param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
'penalty': ['l1', 'l2'],
'solver': ['liblinear', 'saga']},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=1000, random_state=16),
param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
'penalty': ['l1', 'l2'],
'solver': ['liblinear', 'saga']},
scoring='accuracy')LogisticRegression(max_iter=1000, random_state=16)
LogisticRegression(max_iter=1000, random_state=16)
# Printing the best hyperparameters found
print("Best hyperparameters:", grid_search.best_params_)
Best hyperparameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
# Get the best model
best_model = grid_search.best_estimator_
# Evaluate the best model on the test set
y_pred_best = best_model.predict(X_test)
# Calculate accuracy
accuracy_logreg = accuracy_score(Y_test, y_pred_best)
precision_logreg = precision_score(Y_test, y_pred_best)
recall_logreg = recall_score(Y_test, y_pred_best)
f1_logreg = f1_score(Y_test, y_pred_best)
print("Accuracy of best model:", accuracy_logreg)
print("Precision of best model:", precision_logreg)
print("Recall of best model:", recall_logreg)
print("F1 of best model:", f1_logreg)
# Print classification report
print("Logistic Regression Classification Report:")
print(classification_report(Y_test, y_pred_best))
Accuracy of best model: 0.8230437903804738
Precision of best model: 0.8543307086614174
Recall of best model: 0.7788944723618091
F1 of best model: 0.8148704468644387
Logistic Regression Classification Report:
precision recall f1-score support
0 0.80 0.87 0.83 1393
1 0.85 0.78 0.81 1393
accuracy 0.82 2786
macro avg 0.83 0.82 0.82 2786
weighted avg 0.83 0.82 0.82 2786
# Plot the confusion matrix for the Logistic Regression Model
cm_log = confusion_matrix(Y_test, y_pred_best)
disp_log = metrics.ConfusionMatrixDisplay(confusion_matrix=cm_log);
disp_log.plot(cmap='Oranges')
plt.title('Confusion Matrix (Logistic Regression)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('Confusion Matrix for Logistic Regression.jpg')
plt.show()
# Calculate Precision and Recall for the optimized model
precision_best, recall_best, thresholds_best = precision_recall_curve(Y_test, y_pred_best)
# Create Precision-Recall Curve
fig, ax = plt.subplots()
ax.plot(recall_best, precision_best, color='purple')
# Adding axis labels and title to the plot
ax.set_title('Precision-Recall Curve for Optimised Logistic Regression Model')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
# Save the plot as an image file and display
plt.savefig('Precision-Recall Curve for Optimised Logistic Regression Model.jpg')
plt.show()
# Calculate False Positive Rate, True Positive Rate, and Thresholds for the optimized model
fpr_best, tpr_best, thresholds_best = roc_curve(Y_test, y_pred_best)
# Calculate the AUC score for the optimized model
auc_score_best = roc_auc_score(Y_test, y_pred_best)
# Plot ROC curve for the optimized model
plt.figure(figsize=(8, 6))
plt.plot(fpr_best, tpr_best, color='blue', lw=2, label=f'AUC = {auc_score_best:.2f}')
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for Optimised Logistic Regression Model')
plt.legend(loc='center')
plt.grid(True)
plt.savefig('Receiver Operating Characteristic (ROC) Curve for Optimised Logistic Regression Model.jpg')
plt.show()
# Train a Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, Y_train)
RandomForestClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier()
y_pred_rf = rf_model.predict(X_test)
#printing accuracy
accuracy = accuracy_score(Y_test, y_pred_rf)
print("Accuracy:", accuracy)
# Print classification report
print("Random Forest Classification Report before optimisation:")
print(classification_report(Y_test, y_pred_rf))
Accuracy: 0.7954055994256999
Random Forest Classification Report before optimisation:
precision recall f1-score support
0 0.83 0.74 0.78 1393
1 0.76 0.85 0.81 1393
accuracy 0.80 2786
macro avg 0.80 0.80 0.79 2786
weighted avg 0.80 0.80 0.79 2786
# Convert X_train to a pandas DataFrame
X_train_df = pd.DataFrame(X_train)
from sklearn.tree import plot_tree
# Export the first three decision trees from the forest
for i in range(3):
plt.figure(figsize=(10, 5))
tree = rf_model.estimators_[i]
plot_tree(tree,
feature_names=X_train_df.columns,
filled=True,
max_depth=2,
impurity=False,
proportion=True)
plt.show()
# Define the parameter grid for RandomForestClassifier
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [None, 10, 20],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
# The grid search
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=3)
# Fit the grid search to the data
grid_search.fit(X_train, Y_train)
GridSearchCV(cv=3, estimator=RandomForestClassifier(),
param_grid={'max_depth': [None, 10, 20],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10],
'n_estimators': [100, 200, 300]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=3, estimator=RandomForestClassifier(),
param_grid={'max_depth': [None, 10, 20],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10],
'n_estimators': [100, 200, 300]})RandomForestClassifier()
RandomForestClassifier()
# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
# Print the best parameters
print("Best Parameters:", best_params)
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
# Generate predictions with the best model
y_pred_best_rf = best_estimator.predict(X_test)
accuracy_rf = accuracy_score(Y_test, y_pred_best_rf)
precision_rf = precision_score(Y_test, y_pred_best_rf)
recall_rf = recall_score(Y_test, y_pred_best_rf)
f1_rf = f1_score(Y_test, y_pred_best_rf)
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1 Score:", f1_rf)
# Print classification report
print("Random Forest Classification Report:")
print(classification_report(Y_test, y_pred_best_rf))
Accuracy: 0.8277099784637473
Precision: 0.8217054263565892
Recall: 0.8370423546302943
F1 Score: 0.829302987197724
Random Forest Classification Report:
precision recall f1-score support
0 0.83 0.82 0.83 1393
1 0.82 0.84 0.83 1393
accuracy 0.83 2786
macro avg 0.83 0.83 0.83 2786
weighted avg 0.83 0.83 0.83 2786
# Plot the confusion matrix for the Random Forest Model
cm = confusion_matrix(Y_test, y_pred_best_rf)
disp_rf = metrics.ConfusionMatrixDisplay(confusion_matrix=cm);
disp_rf.plot(cmap='Greens')
plt.title('Confusion Matrix (Random Forest)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig("Confusion Matrix for Random Forest")
plt.show()
# Calculate Precision and Recall
precision, recall, thresholds = precision_recall_curve(Y_test, best_estimator.predict_proba(X_test)[:, 1])
# Create Precision Recall Curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color='purple')
# Adding axis labels and title to the plot
ax.set_title('Precision-Recall Curve for Random Forest')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
plt.savefig('Precision-Recall Curve for Random Forest.jpg')
plt.show()
# Calculate the false positive rate, true positive rate, and thresholds
fpr, tpr, thresholds = roc_curve(Y_test, y_pred_best_rf)
# Calculate the AUC score
auc_score = roc_auc_score(Y_test, y_pred_best_rf)
# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'AUC = {auc_score:.2f}')
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for Random Forest Classifier')
plt.legend(loc='center')
plt.grid(True)
plt.savefig('Receiver Operating Characteristic (ROC) Curve for Random Forest Classifier.jpg')
plt.show()
# Initializing the KNN classifier with a specified number of neighbors (e.g., 3)
knn = KNeighborsClassifier(n_neighbors=3)
# Training the classifier on the training data
knn.fit(X_train, Y_train)
KNeighborsClassifier(n_neighbors=3)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsClassifier(n_neighbors=3)
#Evaluating the performance of the model using the test set
knn_Y_predict = knn.predict(X_test)
# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(Y_test, knn_Y_predict))
Accuracy: 0.7613065326633166
# Print classification report
print("KNN Classification Report:")
print(classification_report(Y_test,knn_Y_predict))
KNN Classification Report:
precision recall f1-score support
0 0.77 0.74 0.76 1393
1 0.75 0.78 0.77 1393
accuracy 0.76 2786
macro avg 0.76 0.76 0.76 2786
weighted avg 0.76 0.76 0.76 2786
# Defining the range of values for n_neighbors
neighbors = np.arange(1, 36)
# Create an empty list to store the accuracy at different values of K
KNN_accuracy = []
# Loop through each value of n_neighbors and perform grid search
for k in neighbors:
knn = KNeighborsClassifier(n_neighbors=k)
grid_search = GridSearchCV(knn, {'n_neighbors': [k]}, cv=5)
grid_search.fit(X_train, Y_train)
KNN_accuracy.append(grid_search.best_score_)
# Plot the mean test scores
plt.figure(figsize=(10, 6))
plt.plot(neighbors, KNN_accuracy, marker='o')
plt.title('KNN Accuracy for Different Values of K')
plt.xlabel('Number of Neighbors (K)')
plt.ylabel('Accuracy')
plt.grid(True)
plt.xticks(np.arange(0, 36, step=2))
plt.savefig('KNN Accuracy for Different Values of K.jpg')
plt.show()
best_index = np.argmax(KNN_accuracy)
best_k = neighbors[best_index]
print(f"The best value of n_neighbors (K) is: {best_k}")
The best value of n_neighbors (K) is: 3
knn= KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train,Y_train)
best_k_predictions = knn.predict(X_test)
# Calculate precision and recall for the best K value
accuracy_knn = accuracy_score(Y_test, best_k_predictions)
precision_knn = precision_score(Y_test, best_k_predictions)
recall_knn = recall_score(Y_test, best_k_predictions)
f1_knn = f1_score(Y_test, best_k_predictions)
# Print the accuracy, precision, and recall for the best K value
print(f"Best K value: {best_k}")
print(f"Accuracy for best K value: {accuracy_knn}")
print(f"Precision for best K value: {precision_knn}")
print(f"Recall for best K value: {recall_knn}")
print(f"F1 Score for best K value: {f1_knn}")
# Print classification report
print("KNN Classification Report:")
print(classification_report(Y_test ,best_k_predictions))
Best K value: 3
Accuracy for best K value: 0.7613065326633166
Precision for best K value: 0.7506887052341598
Recall for best K value: 0.782483847810481
F1 Score for best K value: 0.7662565905096661
KNN Classification Report:
precision recall f1-score support
0 0.77 0.74 0.76 1393
1 0.75 0.78 0.77 1393
accuracy 0.76 2786
macro avg 0.76 0.76 0.76 2786
weighted avg 0.76 0.76 0.76 2786
# Plot the confusion matrix for the KNN Model
cm_knn = confusion_matrix(Y_test, best_k_predictions)
disp_knn = metrics.ConfusionMatrixDisplay(confusion_matrix=cm_knn);
disp_knn.plot(cmap='Purples')
plt.title(f'Confusion Matrix for KNN (K={best_k})')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('Confusion Matrix for KNN.jpg')
plt.show()
# Calculate Precision and Recall
precision, recall, thresholds = precision_recall_curve(Y_test, knn.predict_proba(X_test)[:, 1])
# Create Precision Recall Curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color='purple')
# Adding axis labels and title to the plot
ax.set_title('Precision-Recall Curve for KNN')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
plt.savefig('Precision-Recall Curve for KNN.jpg')
plt.show()
# Calculate the false positive rate, true positive rate, and thresholds
fpr, tpr, thresholds = roc_curve(Y_test, best_k_predictions)
# Calculate the AUC score
auc_score = roc_auc_score(Y_test, best_k_predictions)
# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'AUC = {auc_score:.2f}')
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for KNN')
plt.legend(loc='center')
plt.grid(True)
plt.savefig('Receiver Operating Characteristic (ROC) Curve for KNN.jpg')
plt.show()
# Initializing the GNB classifier
GNB = GaussianNB()
# Fitting the classifier to the training data
GNB.fit(X_train, Y_train)
GaussianNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GaussianNB()
# Predicting the target variable for the test data
gnb_Y_predict = GNB.predict(X_test)
# Calculate precision and recall for the GNB Model
accuracy_gnb = accuracy_score(Y_test, gnb_Y_predict)
precision_gnb = precision_score(Y_test, gnb_Y_predict)
recall_gnb= recall_score(Y_test, gnb_Y_predict)
f1_gnb= f1_score(Y_test, gnb_Y_predict)
# Print the accuracy, precision, and recall for GNB Model
print(f"Accuracy for GNB Model: {accuracy_gnb} ")
print(f"Precision for GNB Model: {precision_gnb}")
print(f"Recall for GNB Model: {recall_gnb}")
print(f"F1 Score for GNB Model: {f1_gnb}")
Accuracy for GNB Model: 0.7989949748743719 Precision for GNB Model: 0.8078344419807835 Recall for GNB Model: 0.7846374730796841 F1 Score for GNB Model: 0.7960670065549891
# Print classification report
print("GNB Classification Report:")
print(classification_report(Y_test, gnb_Y_predict))
GNB Classification Report:
precision recall f1-score support
0 0.79 0.81 0.80 1393
1 0.81 0.78 0.80 1393
accuracy 0.80 2786
macro avg 0.80 0.80 0.80 2786
weighted avg 0.80 0.80 0.80 2786
# Define the parameter grid
param_grid = {'var_smoothing': np.logspace(0,-9, num=100)}
# Initialize GridSearchCV
grid_search = GridSearchCV(GNB, param_grid, cv=5, scoring='accuracy')
# Fit the grid search to the data
grid_search.fit(X_train, Y_train)
# Get the best parameters
best_params = grid_search.best_params_
# Initialize GNB with the best parameters
optimised_GNB = GaussianNB(var_smoothing=best_params['var_smoothing'])
# Fit the optimized model to the training data
optimised_GNB.fit(X_train, Y_train)
# Predict the target variable for the test data
optimised_gnb_Y_predict = optimised_GNB.predict(X_test)
# Calculate and print accuracy, precision, and recall for the optimized GNB model
accuracy_gnb = accuracy_score(Y_test, optimised_gnb_Y_predict)
precision_gnb = precision_score(Y_test, optimised_gnb_Y_predict)
recall_gnb = recall_score(Y_test, optimised_gnb_Y_predict)
f1_gnb = f1_score(Y_test, optimised_gnb_Y_predict)
print(f"Accuracy for Optimised GNB Model: {accuracy_gnb}")
print(f"Precision for Optimised GNB Model: {precision_gnb}")
print(f"Recall for Optimised GNB Model: {recall_gnb}")
print(f"F1 Score for Optimised GNB Model: {f1_gnb}")
# Print classification report
print("GNB Classification Report:")
print(classification_report(Y_test, optimised_gnb_Y_predict))
Accuracy for Optimised GNB Model: 0.7982770997846375
Precision for Optimised GNB Model: 0.8070953436807096
Recall for Optimised GNB Model: 0.7839195979899497
F1 Score for Optimised GNB Model: 0.7953386744355426
GNB Classification Report:
precision recall f1-score support
0 0.79 0.81 0.80 1393
1 0.81 0.78 0.80 1393
accuracy 0.80 2786
macro avg 0.80 0.80 0.80 2786
weighted avg 0.80 0.80 0.80 2786
# Plot the confusion matrix for the GNB Model
cm_gnb = confusion_matrix(Y_test, optimised_gnb_Y_predict)
disp_gnb = metrics.ConfusionMatrixDisplay(confusion_matrix=cm_knn);
disp_gnb.plot(cmap='Reds')
plt.title(f'Confusion Matrix for GNB Model)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('Confusion Matrix for GNB.jpg')
plt.show()
# Calculate Precision and Recall
precision, recall, thresholds = precision_recall_curve(Y_test, optimised_GNB.predict_proba(X_test)[:, 1])
# Create Precision Recall Curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color='purple')
# Adding axis labels and title to the plot
ax.set_title('Precision-Recall Curve for GNB')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
plt.savefig('Precision-Recall Curve for GNB.jpg')
plt.show()
# Calculate the false positive rate, true positive rate, and thresholds
fpr, tpr, thresholds = roc_curve(Y_test, optimised_gnb_Y_predict)
# Calculate the AUC score
auc_score = roc_auc_score(Y_test, optimised_gnb_Y_predict)
# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'AUC = {auc_score:.2f}')
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for Guassian Naives Bayes')
plt.legend(loc='center')
plt.grid(True)
plt.savefig('Receiver Operating Characteristic (ROC) Curve for GNB.jpg')
plt.show()
# Classifier names
classifiers = ['Logistic_Regression', 'Random_Forest', 'KNN', 'Gaussian Naive Bayes']
# Accuracy scores obtained from cross-validation or testing
accuracy_scores = [accuracy_logreg, accuracy_rf, accuracy_knn, accuracy_gnb]
# Define colors for each classifier
colors = ['skyblue', 'green', 'red', 'purple']
# Create a bar plot
plt.figure(figsize=(10, 6))
bars = plt.barh(classifiers, accuracy_scores, color=colors)
plt.xlabel('Accuracy')
plt.title('Accuracy of Different Classifiers')
plt.gca().invert_yaxis()
# Add labels on bars
for bar, score in zip(bars, accuracy_scores):
plt.text(bar.get_width(), bar.get_y() + bar.get_height()/2, f'{score:.2f}',
va='center', ha='left', fontsize=9)
plt.savefig('Model Comparision.jpg')
plt.show()
# Calculate ROC curves and AUC scores for each model
models = {
'Random Forest': (y_pred_best_rf, 'blue'),
'Logistic Regression': (y_pred_best, 'orange'),
'KNN': (best_k_predictions, 'green'),
'Gaussian Naive Bayes': (optimised_gnb_Y_predict, 'purple')
}
plt.figure(figsize=(10, 8))
# Plot ROC curves for each model
for model_name, (y_pred, color) in models.items():
fpr, tpr, _ = roc_curve(Y_test, y_pred)
auc_score = roc_auc_score(Y_test, y_pred)
plt.plot(fpr, tpr, lw=2, label=f'{model_name} (AUC = {auc_score:.2f})', color=color)
# Plot the diagonal line (no-skill line)
plt.plot([0, 1], [0, 1], linestyle='--', color='red', lw=2)
# Add legend, labels, and title
plt.legend(loc='lower right')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.savefig('Combined_ROC_Curve.png')
plt.show()
# Define the results dictionary
results = {
'Model': ['Logistic Regression', 'Random Forest', 'KNN', 'Naive Bayes'],
'Accuracy %': [accuracy_score(Y_test, y_pred_best), accuracy_score(Y_test, y_pred_best_rf), accuracy_score(Y_test, best_k_predictions), accuracy_score(Y_test, optimised_gnb_Y_predict)],
'Precision %': [precision_score(Y_test, y_pred_best), precision_score(Y_test, y_pred_best_rf), precision_score(Y_test, best_k_predictions), precision_score(Y_test, optimised_gnb_Y_predict)],
'Recall %': [recall_score(Y_test, y_pred_best), recall_score(Y_test, y_pred_best_rf), recall_score(Y_test, best_k_predictions), recall_score(Y_test, optimised_gnb_Y_predict)],
'F1-Score': [f"{f1_score(Y_test, y_pred_best):.2f}", f"{f1_score(Y_test, y_pred_best_rf):.2f}", f"{f1_score(Y_test, best_k_predictions):.2f}", f"{f1_score(Y_test, optimised_gnb_Y_predict):.2f}"]
}
# Convert accuracy, precision, recall scores to percentages and F1 scores to strings with 2 decimal places
results['Accuracy %'] = [f"{score * 100:.2f}%" for score in results['Accuracy %']]
results['Precision %'] = [f"{score * 100:.2f}%" for score in results['Precision %']]
results['Recall %'] = [f"{score * 100:.2f}%" for score in results['Recall %']]
# Create a DataFrame
results_df = pd.DataFrame(results)
# Display the DataFrame
print(results_df)
# Plot the DataFrame as a table with increased font size
fig, ax = plt.subplots(figsize=(14, 8))
ax.axis('tight')
ax.axis('off')
table = ax.table(cellText=results_df.values, colLabels=results_df.columns, loc='center')
# Adjust font size
table.auto_set_font_size(False)
table.set_fontsize(10)
# Save the table as an image
plt.savefig('performance_metrics_table.jpg')
plt.show()
Model Accuracy % Precision % Recall % F1-Score 0 Logistic Regression 82.30% 85.43% 77.89% 0.81 1 Random Forest 82.77% 82.17% 83.70% 0.83 2 KNN 76.13% 75.07% 78.25% 0.77 3 Naive Bayes 79.83% 80.71% 78.39% 0.80
# Define data
models = ['Logistic Regression', 'Random Forest', 'KNN', 'Naive Bayes']
y_true = [Y_test, Y_test, Y_test, Y_test] # Replace with your true labels
y_pred = [y_pred_best, y_pred_best_rf, best_k_predictions, optimised_gnb_Y_predict] # Replace with your predicted labels
# Calculate precision, recall, and F1-score
precision_scores = [precision_score(true, pred) for true, pred in zip(y_true, y_pred)]
recall_scores = [recall_score(true, pred) for true, pred in zip(y_true, y_pred)]
f1_scores = [f1_score(true, pred) for true, pred in zip(y_true, y_pred)]
# Set the width of the bars
bar_width = 0.25
# Set the positions of the bars on the x-axis
r1 = np.arange(len(models))
r2 = [x + bar_width for x in r1]
r3 = [x + bar_width for x in r2]
# Plot grouped bar chart with custom colors
plt.bar(r1, precision_scores, color='skyblue', width=bar_width, edgecolor='grey', label='Precision')
plt.bar(r2, recall_scores, color='green', width=bar_width, edgecolor='grey', label='Recall')
plt.bar(r3, f1_scores, color='red', width=bar_width, edgecolor='grey', label='F1-Score')
# Add xticks on the middle of the group bars
plt.xlabel('Models', fontweight='bold')
plt.xticks([r + bar_width for r in range(len(models))], models, rotation=45)
# Add labels and title
plt.ylabel('Scores', fontweight='bold')
plt.title('Precision, Recall, and F1-Score by Model')
# Add legend
plt.legend()
# Show plot
plt.tight_layout()
plt.savefig('PRF.jpg')
plt.show()